import json
from typing import Any, Callable, Dict
from pydantic import Field
from .base import LLM
from .prompts.chat import llama2_template_messages
from .prompts.edit import simplified_edit_prompt
class LlamaCpp(LLM):
"""
    Run the llama.cpp server binary to start the API server. If running on a remote machine, be sure to set `--host` to 0.0.0.0:
```shell
.\server.exe -c 4096 --host 0.0.0.0 -t 16 --mlock -m models\meta\llama\codellama-7b-instruct.Q8_0.gguf
```
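
    On Linux or macOS the invocation is the same apart from the binary name and path separators; for example (adjust the model path to wherever your GGUF file lives):

    ```shell
    ./server -c 4096 --host 0.0.0.0 -t 16 --mlock -m models/codellama-7b-instruct.Q8_0.gguf
    ```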
After it's up and running, change `~/.continue/config.py` to look like this:
```python title="~/.continue/config.py"
from continuedev.libs.llm.llamacpp import LlamaCpp
config = ContinueConfig(
...
models=Models(
default=LlamaCpp(
max_context_length=4096,
server_url="http://localhost:8080")
)
)
```
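
    Any options in `llama_cpp_args` are merged into the body of each request sent to the server. For example (the `repeat_penalty` option shown here is illustrative; see the llama.cpp server documentation for the options your build supports):

    ```python
    default=LlamaCpp(
        server_url="http://localhost:8080",
        llama_cpp_args={"stop": ["[INST]"], "repeat_penalty": 1.1},
    )
    ```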
"""
model: str = "llamacpp"
    server_url: str = Field("http://localhost:8080", description="URL of the running llama.cpp server")
llama_cpp_args: Dict[str, Any] = Field(
{"stop": ["[INST]"]},
description="A list of additional arguments to pass to llama.cpp. See [here](https://github.com/ggerganov/llama.cpp/tree/master/examples/server#api-endpoints) for the complete catalog of options.",
)
template_messages: Callable = llama2_template_messages
prompt_templates = {
"edit": simplified_edit_prompt,
}
class Config:
arbitrary_types_allowed = True
def collect_args(self, options) -> Any:
args = super().collect_args(options)
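        # The llama.cpp server's /completion endpoint expects "n_predict" rather than "max_tokens"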
if "max_tokens" in args:
args["n_predict"] = args["max_tokens"]
del args["max_tokens"]
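        # Drop OpenAI-style penalty options rather than forwarding them to llama.cpp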
if "frequency_penalty" in args:
del args["frequency_penalty"]
if "presence_penalty" in args:
del args["presence_penalty"]
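        # Merge in user-supplied llama.cpp arguments without overriding the values set above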
for k, v in self.llama_cpp_args.items():
if k not in args:
args[k] = v
return args
async def _stream_complete(self, prompt, options):
args = self.collect_args(options)
headers = {"Content-Type": "application/json"}
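        # Stream completions from the llama.cpp server's /completion endpoint as they arrive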
async def server_generator():
async with self.create_client_session() as client_session:
async with client_session.post(
f"{self.server_url}/completion",
json={"prompt": prompt, "stream": True, **args},
headers=headers,
proxy=self.proxy,
) as resp:
async for line in resp.content:
content = line.decode("utf-8")
if content.strip() == "":
continue
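                        # Streamed lines are server-sent events of the form 'data: {...}';
                        # strip the 6-character "data: " prefix before parsing the JSON payload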
yield json.loads(content[6:])["content"]
async for chunk in server_generator():
yield chunk