Preview (#541)

* Strong typing (#533) * refactor: :recycle: get rid of continuedev.src.continuedev structure * refactor: :recycle: switching back to server folder * feat: :sparkles: make config.py imports shorter * feat: :bookmark: publish as pre-release vscode extension * refactor: :recycle: refactor and add more completion params to ui * build: :building_construction: download from preview S3 * fix: :bug: fix paths * fix: :green_heart: package:pre-release * ci: :green_heart: more time for tests * fix: :green_heart: fix build scripts * fix: :bug: fix import in run.py * fix: :bookmark: update version to try again * ci: 💚 Update package.json version [skip ci] * refactor: :fire: don't check for old extensions version * fix: :bug: small bug fixes * fix: :bug: fix config.py import paths * ci: 💚 Update package.json version [skip ci] * ci: :green_heart: platform-specific builds test #1 * feat: :green_heart: ship with binary * fix: :green_heart: fix copy statement to include.exe for windows * fix: :green_heart: cd extension before packaging * chore: :loud_sound: count tokens generated * fix: :green_heart: remove npm_config_arch * fix: :green_heart: publish as pre-release! 
* chore: :bookmark: update version * perf: :green_heart: hardcode distro paths * fix: :bug: fix yaml syntax error * chore: :bookmark: update version * fix: :green_heart: update permissions and version * feat: :bug: kill old server if needed * feat: :lipstick: update marketplace icon for pre-release * ci: 💚 Update package.json version [skip ci] * feat: :sparkles: auto-reload for config.py * feat: :wrench: update default config.py imports * feat: :sparkles: codelens in config.py * feat: :sparkles: select model param count from UI * ci: 💚 Update package.json version [skip ci] * feat: :sparkles: more model options, ollama error handling * perf: :zap: don't show server loading immediately * fix: :bug: fixing small UI details * ci: 💚 Update package.json version [skip ci] * feat: :rocket: headers param on LLM class * fix: :bug: fix headers for openai.;y * feat: :sparkles: highlight code on cmd+shift+L * ci: 💚 Update package.json version [skip ci] * feat: :lipstick: sticky top bar in gui.tsx * fix: :loud_sound: websocket logging and horizontal scrollbar * ci: 💚 Update package.json version [skip ci] * feat: :sparkles: allow AzureOpenAI Service through GGML * ci: 💚 Update package.json version [skip ci] * fix: :bug: fix automigration * ci: 💚 Update package.json version [skip ci] * ci: :green_heart: upload binaries in ci, download apple silicon * chore: :fire: remove notes * fix: :green_heart: use curl to download binary * fix: :green_heart: set permissions on apple silicon binary * fix: :green_heart: testing * fix: :green_heart: cleanup file * fix: :green_heart: fix preview.yaml * fix: :green_heart: only upload once per binary * fix: :green_heart: install rosetta * ci: :green_heart: download binary after tests * ci: 💚 Update package.json version [skip ci] * ci: :green_heart: prepare ci for merge to main --------- Co-authored-by: GitHub Action <action@github.com>
author: Nate Sesti <33237525+sestinj@users.noreply.github.com> 2023-10-09 18:37:27 -0700
committer: GitHub <noreply@github.com> 2023-10-09 18:37:27 -0700
commit: f09150617ed2454f3074bcf93f53aae5ae637d40 (patch)
tree: 5cfe614a64d921dfe58b049f426d67a8b832c71f /server/continuedev/libs/llm
parent: 985304a213f620cdff3f8f65f74ed7e3b79be29d (diff)
download: sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.tar.gz
sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.tar.bz2
sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.zip
20 files changed, 2194 insertions, 0 deletions
diff --git a/server/continuedev/libs/llm/__init__.py b/server/continuedev/libs/llm/__init__.py
new file mode 100644
index 00000000..829ffede
--- /dev/null
+++ b/server/continuedev/libs/llm/__init__.py
@@ -0,0 +1,14 @@
+from .anthropic import AnthropicLLM  # noqa: F401
+from .ggml import GGML  # noqa: F401
+from .google_palm_api import GooglePaLMAPI  # noqa: F401
+from .hf_inference_api import HuggingFaceInferenceAPI  # noqa: F401
+from .hf_tgi import HuggingFaceTGI  # noqa: F401
+from .llamacpp import LlamaCpp  # noqa: F401
+from .ollama import Ollama  # noqa: F401
+from .openai import OpenAI  # noqa: F401
+from .openai_free_trial import OpenAIFreeTrial  # noqa: F401
+from .proxy_server import ProxyServer  # noqa: F401
+from .queued import QueuedLLM  # noqa: F401
+from .replicate import ReplicateLLM  # noqa: F401
+from .text_gen_interface import TextGenUI  # noqa: F401
+from .together import TogetherLLM  # noqa: F401
diff --git a/server/continuedev/libs/llm/anthropic.py b/server/continuedev/libs/llm/anthropic.py
new file mode 100644
index 00000000..7d0708f1
--- /dev/null
+++ b/server/continuedev/libs/llm/anthropic.py
@@ -0,0 +1,74 @@
+from typing import Any, Callable, Coroutine
+
+from anthropic import AI_PROMPT, HUMAN_PROMPT, AsyncAnthropic
+
+from .base import LLM, CompletionOptions
+from .prompts.chat import anthropic_template_messages
+
+
+class AnthropicLLM(LLM):
+    """
+    Import the `AnthropicLLM` class and set it as the default model:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.anthropic import AnthropicLLM
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=AnthropicLLM(api_key="<API_KEY>", model="claude-2")
+        )
+    )
+    ```
+
+    Claude 2 is not yet publicly released. You can request early access [here](https://www.anthropic.com/earlyaccess).
+
+    """
+
+    api_key: str
+    "Anthropic API key"
+
+    model: str = "claude-2"
+
+    # Created in `start()`; stays None until then.
+    _async_client: AsyncAnthropic = None
+
+    template_messages: Callable = anthropic_template_messages
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    async def start(self, **kwargs):
+        """Run the base-class start-up, then create the async Anthropic client."""
+        await super().start(**kwargs)
+        self._async_client = AsyncAnthropic(api_key=self.api_key)
+
+        if self.model == "claude-2":
+            self.context_length = 100_000
+
+    def collect_args(self, options: CompletionOptions):
+        """Build the keyword arguments passed to `completions.create`.
+
+        Starts from the base-class arguments, removes `stop`,
+        `frequency_penalty` and `presence_penalty`, and renames `max_tokens`
+        to `max_tokens_to_sample`.
+        """
+        args = super().collect_args(options)
+
+        # Strip the keys from the collected dict instead of assigning
+        # `options.stop = None`, so the caller's options object is not mutated.
+        for dropped in ("stop", "frequency_penalty", "presence_penalty"):
+            args.pop(dropped, None)
+        if "max_tokens" in args:
+            args["max_tokens_to_sample"] = args.pop("max_tokens")
+        return args
+
+    async def _stream_complete(self, prompt: str, options):
+        """Yield the `completion` text of each streamed chunk for `prompt`."""
+        args = self.collect_args(options)
+        prompt = f"{HUMAN_PROMPT} {prompt} {AI_PROMPT}"
+
+        async for chunk in await self._async_client.completions.create(
+            prompt=prompt, stream=True, **args
+        ):
+            yield chunk.completion
+
+    async def _complete(self, prompt: str, options) -> str:
+        """Return the `completion` text of a single non-streamed request."""
+        args = self.collect_args(options)
+        prompt = f"{HUMAN_PROMPT} {prompt} {AI_PROMPT}"
+        response = await self._async_client.completions.create(prompt=prompt, **args)
+        return response.completion
diff --git a/server/continuedev/libs/llm/base.py b/server/continuedev/libs/llm/base.py
new file mode 100644
index 00000000..d77cb9fc
--- /dev/null
+++ b/server/continuedev/libs/llm/base.py
@@ -0,0 +1,458 @@
+import ssl
+from typing import Any, Callable, Coroutine, Dict, Generator, List, Optional, Union
+
+import aiohttp
+import certifi
+from pydantic import Field, validator
+
+from ...core.main import ChatMessage
+from ...models.main import ContinueBaseModel
+from ..util.count_tokens import (
+    DEFAULT_ARGS,
+    DEFAULT_MAX_TOKENS,
+    compile_chat_messages,
+    count_tokens,
+    format_chat_messages,
+    prune_raw_prompt_from_top,
+)
+from ..util.devdata import dev_data_logger
+from ..util.telemetry import posthog_logger
+
+
+class CompletionOptions(ContinueBaseModel):
+    """Options for the completion.
+
+    Any field passed as None is replaced by that field's declared default
+    (see `ignore_none_and_set_default`).
+    """
+
+    @validator(
+        "*",
+        pre=True,
+        always=True,
+    )
+    def ignore_none_and_set_default(cls, value, field):
+        """Substitute the field default whenever the supplied value is None."""
+        return value if value is not None else field.default
+
+    model: Optional[str] = Field(None, description="The model name")
+    temperature: Optional[float] = Field(
+        None, description="The temperature of the completion."
+    )
+    top_p: Optional[float] = Field(None, description="The top_p of the completion.")
+    top_k: Optional[int] = Field(None, description="The top_k of the completion.")
+    presence_penalty: Optional[float] = Field(
+        None, description="The presence penalty of the completion."
+    )
+    frequency_penalty: Optional[float] = Field(
+        None, description="The frequency penalty of the completion."
+    )
+    stop: Optional[List[str]] = Field(
+        None, description="The stop tokens of the completion."
+    )
+    max_tokens: int = Field(
+        DEFAULT_MAX_TOKENS, description="The maximum number of tokens to generate."
+    )
+    functions: Optional[List[Any]] = Field(
+        None, description="The functions/tools to make available to the model."
+    )
+
+
+class LLM(ContinueBaseModel):
+    title: Optional[str] = Field(
+        None,
+        description="A title that will identify this model in the model selection dropdown",
+    )
+
+    unique_id: Optional[str] = Field(None, description="The unique ID of the user.")
+    model: str = Field(
+        ..., description="The name of the model to be used (e.g. gpt-4, codellama)"
+    )
+
+    system_message: Optional[str] = Field(
+        None, description="A system message that will always be followed by the LLM"
+    )
+
+    context_length: int = Field(
+        2048,
+        description="The maximum context length of the LLM in tokens, as counted by count_tokens.",
+    )
+
+    stop_tokens: Optional[List[str]] = Field(
+        None, description="Tokens that will stop the completion."
+    )
+    temperature: Optional[float] = Field(
+        None, description="The temperature of the completion."
+    )
+    top_p: Optional[float] = Field(None, description="The top_p of the completion.")
+    top_k: Optional[int] = Field(None, description="The top_k of the completion.")
+    presence_penalty: Optional[float] = Field(
+        None, description="The presence penalty of the completion."
+    )
+    frequency_penalty: Optional[float] = Field(
+        None, description="The frequency penalty of the completion."
+    )
+
+    timeout: Optional[int] = Field(
+        300,
+        description="Set the timeout for each request to the LLM. If you are running a local LLM that takes a while to respond, you might want to set this to avoid timeouts.",
+    )
+    verify_ssl: Optional[bool] = Field(
+        None, description="Whether to verify SSL certificates for requests."
+    )
+    # Optional: `create_client_session` falls back to certifi's bundle when this is None.
+    ca_bundle_path: Optional[str] = Field(
+        None,
+        description="Path to a custom CA bundle to use when making the HTTP request",
+    )
+    proxy: Optional[str] = Field(
+        None,
+        description="Proxy URL to use when making the HTTP request",
+    )
+    headers: Optional[Dict[str, str]] = Field(
+        None,
+        description="Headers to use when making the HTTP request",
+    )
+    prompt_templates: dict = Field(
+        {},
+        description='A dictionary of prompt templates that can be used to customize the behavior of the LLM in certain situations. For example, set the "edit" key in order to change the prompt that is used for the /edit slash command. Each value in the dictionary is a string templated in mustache syntax, and filled in at runtime with the variables specific to the situation. See the documentation for more information.',
+    )
+
+    template_messages: Optional[Callable[[List[Dict[str, str]]], str]] = Field(
+        None,
+        description="A function that takes a list of messages and returns a prompt. This ensures that models like llama2, which are trained on specific chat formats, will always receive input in that format.",
+    )
+    write_log: Optional[Callable[[str], None]] = Field(
+        None,
+        description="A function that is called upon every prompt and completion, by default to log to the file which can be viewed by clicking on the magnifying glass.",
+    )
+
+    api_key: Optional[str] = Field(
+        None, description="The API key for the LLM provider."
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+        extra = "allow"
+        fields = {
+            "title": {
+                "description": "A title that will identify this model in the model selection dropdown"
+            },
+            "system_message": {
+                "description": "A system message that will always be followed by the LLM"
+            },
+            "context_length": {
+                "description": "The maximum context length of the LLM in tokens, as counted by count_tokens."
+            },
+            "unique_id": {"description": "The unique ID of the user."},
+            "model": {
+                "description": "The name of the model to be used (e.g. gpt-4, codellama)"
+            },
+            "timeout": {
+                "description": "Set the timeout for each request to the LLM. If you are running a local LLM that takes a while to respond, you might want to set this to avoid timeouts."
+            },
+            "prompt_templates": {
+                "description": 'A dictionary of prompt templates that can be used to customize the behavior of the LLM in certain situations. For example, set the "edit" key in order to change the prompt that is used for the /edit slash command. Each value in the dictionary is a string templated in mustache syntax, and filled in at runtime with the variables specific to the situation. See the documentation for more information.'
+            },
+            "template_messages": {
+                "description": "A function that takes a list of messages and returns a prompt. This ensures that models like llama2, which are trained on specific chat formats, will always receive input in that format."
+            },
+            "write_log": {
+                "description": "A function that is called upon every prompt and completion, by default to log to the file which can be viewed by clicking on the magnifying glass."
+            },
+            "api_key": {"description": "The API key for the LLM provider."},
+            "verify_ssl": {
+                "description": "Whether to verify SSL certificates for requests."
+            },
+            "ca_bundle_path": {
+                "description": "Path to a custom CA bundle to use when making the HTTP request"
+            },
+            "headers": {
+                "description": "Headers to use when making the HTTP request"
+            },
+            "proxy": {"description": "Proxy URL to use when making the HTTP request"},
+            "stop_tokens": {"description": "Tokens that will stop the completion."},
+            "temperature": {
+                "description": "The sampling temperature used for generation."
+            },
+            "top_p": {
+                "description": "The top_p sampling parameter used for generation."
+            },
+            "top_k": {
+                "description": "The top_k sampling parameter used for generation."
+            },
+            "presence_penalty": {
+                "description": "The presence penalty used for completions."
+            },
+            "frequency_penalty": {
+                "description": "The frequency penalty used for completions."
+            },
+        }
+
+    def dict(self, **kwargs):
+        """Serialize the model, with adjustments for runtime-only fields.
+
+        `write_log` and `unique_id` are removed, `template_messages` is
+        replaced by the function's `__name__`, and the concrete class name is
+        added under `class_name`.
+        """
+        original_dict = super().dict(**kwargs)
+        # Pop with a default: the caller's `include`/`exclude` kwargs may
+        # already have removed these keys.
+        original_dict.pop("write_log", None)
+        original_dict.pop("unique_id", None)
+        if self.template_messages is not None:
+            original_dict["template_messages"] = self.template_messages.__name__
+        original_dict["class_name"] = self.__class__.__name__
+        return original_dict
+
+    async def start(
+        self,
+        write_log: Optional[Callable[[str], None]] = None,
+        unique_id: Optional[str] = None,
+    ):
+        """Start the connection to the LLM.
+
+        The base implementation only stores `write_log` and `unique_id` on the
+        instance; both remain None when the caller omits them.
+        """
+        self.write_log = write_log
+        self.unique_id = unique_id
+
+    async def stop(self):
+        """Stop the connection to the LLM."""
+        pass
+
+    def create_client_session(self):
+        """Create an `aiohttp.ClientSession` configured from this LLM's settings.
+
+        Certificate verification is disabled only when `verify_ssl` is
+        explicitly False. Otherwise an SSL context is built from
+        `ca_bundle_path`, or from certifi's bundle when no path is set. The
+        session also carries `timeout` (as the total timeout) and `headers`.
+        """
+        if self.verify_ssl is False:
+            ssl_setting = False
+        else:
+            ca_bundle_path = (
+                certifi.where() if self.ca_bundle_path is None else self.ca_bundle_path
+            )
+            ssl_setting = ssl.create_default_context(cafile=ca_bundle_path)
+
+        # `ssl=` replaces the deprecated `verify_ssl=` / `ssl_context=` connector
+        # arguments and accepts either False or an SSLContext.
+        return aiohttp.ClientSession(
+            connector=aiohttp.TCPConnector(ssl=ssl_setting),
+            timeout=aiohttp.ClientTimeout(total=self.timeout),
+            headers=self.headers,
+        )
+
+    def collect_args(self, options: CompletionOptions) -> Dict[str, Any]:
+        """Merge the default arguments, this LLM's model name and the given options.
+
+        Only options that were explicitly set and are not None are included;
+        they take precedence over the defaults and over `self.model`.
+        """
+        overrides = options.dict(exclude_unset=True, exclude_none=True)
+        return {**DEFAULT_ARGS, "model": self.model, **overrides}
+
+    def compile_chat_messages(
+        self,
+        options: CompletionOptions,
+        msgs: List[ChatMessage],
+        functions: Optional[List[Any]] = None,
+    ) -> List[Dict]:
+        """Delegate to the module-level `compile_chat_messages` helper.
+
+        The model name and `max_tokens` come from `options`; the context
+        length and system message come from this LLM instance.
+        """
+        return compile_chat_messages(
+            model_name=options.model,
+            msgs=msgs,
+            context_length=self.context_length,
+            max_tokens=options.max_tokens,
+            functions=functions,
+            system_message=self.system_message,
+        )
+
+    def template_prompt_like_messages(self, prompt: str) -> str:
+        """Wrap a raw prompt as chat messages and render it with `template_messages`.
+
+        Returns `prompt` unchanged when no template function is configured.
+        The system message, when set, is placed before the user message.
+        """
+        template = self.template_messages
+        if template is None:
+            return prompt
+
+        user_message = {"role": "user", "content": prompt}
+        if self.system_message is None:
+            return template([user_message])
+
+        system_message = {"role": "system", "content": self.system_message}
+        return template([system_message, user_message])
+
+    async def stream_complete(
+        self,
+        prompt: str,
+        raw: bool = False,
+        model: str = None,
+        temperature: float = None,
+        top_p: float = None,
+        top_k: int = None,
+        presence_penalty: float = None,
+        frequency_penalty: float = None,
+        stop: Optional[List[str]] = None,
+        max_tokens: Optional[int] = None,
+        functions: Optional[List[Any]] = None,
+        log: bool = True,
+    ) -> Generator[Union[Any, List, Dict], None, None]:
+        """Stream a completion for a single prompt, yielding each chunk as it arrives.
+
+        Sampling arguments left as None fall back to the values configured on
+        this LLM instance. Unless `raw` is true, the prompt is rendered through
+        `template_prompt_like_messages` first. When `log` is true and a
+        `write_log` callback is set, the final prompt is passed to it. After
+        the stream is exhausted, the generated token count is reported to the
+        dev-data and PostHog loggers.
+        """
+        # Compare against None instead of using `or`, so an explicit 0 / 0.0
+        # is not replaced by the instance-level default.
+        options = CompletionOptions(
+            model=model or self.model,
+            temperature=self.temperature if temperature is None else temperature,
+            top_p=self.top_p if top_p is None else top_p,
+            top_k=self.top_k if top_k is None else top_k,
+            presence_penalty=(
+                self.presence_penalty if presence_penalty is None else presence_penalty
+            ),
+            frequency_penalty=(
+                self.frequency_penalty
+                if frequency_penalty is None
+                else frequency_penalty
+            ),
+            stop=stop or self.stop_tokens,
+            max_tokens=max_tokens,
+            functions=functions,
+        )
+
+        prompt = prune_raw_prompt_from_top(
+            self.model, self.context_length, prompt, options.max_tokens
+        )
+
+        if not raw:
+            prompt = self.template_prompt_like_messages(prompt)
+
+        # `write_log` is None until `start()` is given a callback.
+        if log and self.write_log is not None:
+            self.write_log(prompt)
+
+        completion = ""
+        async for chunk in self._stream_complete(prompt=prompt, options=options):
+            yield chunk
+            completion += chunk
+
+        # Count once and reuse the value for both telemetry sinks.
+        tokens_generated = self.count_tokens(completion)
+        dev_data_logger.capture(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+        posthog_logger.capture_event(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+
+    async def complete(
+        self,
+        prompt: str,
+        raw: bool = False,
+        model: str = None,
+        temperature: float = None,
+        top_p: float = None,
+        top_k: int = None,
+        presence_penalty: float = None,
+        frequency_penalty: float = None,
+        stop: Optional[List[str]] = None,
+        max_tokens: Optional[int] = None,
+        functions: Optional[List[Any]] = None,
+        log: bool = True,
+    ) -> str:
+        """Return the full completion for a single prompt.
+
+        Sampling arguments left as None fall back to the values configured on
+        this LLM instance. Unless `raw` is true, the prompt is rendered through
+        `template_prompt_like_messages` first. When `log` is true and a
+        `write_log` callback is set, the final prompt is passed to it. The
+        generated token count is reported to the dev-data and PostHog loggers
+        before returning.
+        """
+        # Compare against None instead of using `or`, so an explicit 0 / 0.0
+        # is not replaced by the instance-level default.
+        options = CompletionOptions(
+            model=model or self.model,
+            temperature=self.temperature if temperature is None else temperature,
+            top_p=self.top_p if top_p is None else top_p,
+            top_k=self.top_k if top_k is None else top_k,
+            presence_penalty=(
+                self.presence_penalty if presence_penalty is None else presence_penalty
+            ),
+            frequency_penalty=(
+                self.frequency_penalty
+                if frequency_penalty is None
+                else frequency_penalty
+            ),
+            stop=stop or self.stop_tokens,
+            max_tokens=max_tokens,
+            functions=functions,
+        )
+
+        prompt = prune_raw_prompt_from_top(
+            self.model, self.context_length, prompt, options.max_tokens
+        )
+
+        if not raw:
+            prompt = self.template_prompt_like_messages(prompt)
+
+        # `write_log` is None until `start()` is given a callback.
+        if log and self.write_log is not None:
+            self.write_log(prompt)
+
+        completion = await self._complete(prompt=prompt, options=options)
+
+        # Count once and reuse the value for both telemetry sinks.
+        tokens_generated = self.count_tokens(completion)
+        dev_data_logger.capture(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+        posthog_logger.capture_event(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+
+        return completion
+
+    async def stream_chat(
+        self,
+        messages: List[ChatMessage],
+        model: str = None,
+        temperature: float = None,
+        top_p: float = None,
+        top_k: int = None,
+        presence_penalty: float = None,
+        frequency_penalty: float = None,
+        stop: Optional[List[str]] = None,
+        max_tokens: Optional[int] = None,
+        functions: Optional[List[Any]] = None,
+        log: bool = True,
+    ) -> Generator[Union[Any, List, Dict], None, None]:
+        """Stream a chat response, yielding message deltas as dicts.
+
+        Sampling arguments left as None fall back to the values configured on
+        this LLM instance. When `template_messages` is set, the compiled
+        messages are rendered to one prompt and streamed through
+        `_stream_complete`, with each chunk wrapped as an assistant message.
+        Otherwise the messages go to `_stream_chat` and its chunks are yielded
+        unchanged. After the stream is exhausted, the generated token count is
+        reported to the dev-data and PostHog loggers.
+        """
+        # Compare against None instead of using `or`, so an explicit 0 / 0.0
+        # is not replaced by the instance-level default.
+        options = CompletionOptions(
+            model=model or self.model,
+            temperature=self.temperature if temperature is None else temperature,
+            top_p=self.top_p if top_p is None else top_p,
+            top_k=self.top_k if top_k is None else top_k,
+            presence_penalty=(
+                self.presence_penalty if presence_penalty is None else presence_penalty
+            ),
+            frequency_penalty=(
+                self.frequency_penalty
+                if frequency_penalty is None
+                else frequency_penalty
+            ),
+            stop=stop or self.stop_tokens,
+            max_tokens=max_tokens,
+            functions=functions,
+        )
+
+        messages = self.compile_chat_messages(
+            options=options, msgs=messages, functions=functions
+        )
+        if self.template_messages is not None:
+            prompt = self.template_messages(messages)
+        else:
+            # Only used for logging in this branch.
+            prompt = format_chat_messages(messages)
+
+        # `write_log` is None until `start()` is given a callback.
+        if log and self.write_log is not None:
+            self.write_log(prompt)
+
+        completion = ""
+
+        # Use the template_messages function if it exists and do a raw completion
+        if self.template_messages is None:
+            async for chunk in self._stream_chat(messages=messages, options=options):
+                yield chunk
+                # A delta may carry `"content": None`; skip it when accumulating.
+                if "content" in chunk and chunk["content"] is not None:
+                    completion += chunk["content"]
+        else:
+            async for chunk in self._stream_complete(prompt=prompt, options=options):
+                yield {"role": "assistant", "content": chunk}
+                completion += chunk
+
+        # Count once and reuse the value for both telemetry sinks.
+        tokens_generated = self.count_tokens(completion)
+        dev_data_logger.capture(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+        posthog_logger.capture_event(
+            "tokens_generated",
+            {"model": self.model, "tokens": tokens_generated},
+        )
+
+    def _stream_complete(
+        self, prompt, options: CompletionOptions
+    ) -> Generator[str, None, None]:
+        """Stream the completion through generator.
+
+        The base implementation always raises NotImplementedError. The other
+        methods of this class consume the result with `async for`, so an
+        override is expected to produce an async iterator of text chunks.
+        """
+        raise NotImplementedError
+
+    async def _complete(
+        self, prompt: str, options: CompletionOptions
+    ) -> Coroutine[Any, Any, str]:
+        """Return the whole completion by concatenating every streamed chunk."""
+        pieces = [
+            piece
+            async for piece in self._stream_complete(prompt=prompt, options=options)
+        ]
+        return "".join(pieces)
+
+    async def _stream_chat(
+        self, messages: List[ChatMessage], options: CompletionOptions
+    ) -> Generator[Union[Any, List, Dict], None, None]:
+        """Stream a chat by rendering the messages to a prompt and completing it.
+
+        Raises NotImplementedError when no `template_messages` function is set.
+        Each text chunk is yielded as an assistant message dict.
+        """
+        template = self.template_messages
+        if template is None:
+            raise NotImplementedError(
+                "You must either implement template_messages or _stream_chat"
+            )
+
+        rendered_prompt = template(messages)
+        async for text in self._stream_complete(
+            prompt=rendered_prompt, options=options
+        ):
+            yield {"role": "assistant", "content": text}
+
+    def count_tokens(self, text: str):
+        """Return the number of tokens in the given text.
+
+        Delegates to the module-level `count_tokens` helper with `self.model`.
+        """
+        return count_tokens(self.model, text)
diff --git a/server/continuedev/libs/llm/ggml.py b/server/continuedev/libs/llm/ggml.py
new file mode 100644
index 00000000..55d580a8
--- /dev/null
+++ b/server/continuedev/libs/llm/ggml.py
@@ -0,0 +1,226 @@
+import json
+from typing import Any, Callable, Coroutine, Dict, List, Literal, Optional
+
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from ..util.logging import logger
+from .base import LLM, CompletionOptions
+from .openai import CHAT_MODELS
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class GGML(LLM):
+    """
+    See our [5 minute quickstart](https://github.com/continuedev/ggml-server-example) to run any model locally with ggml. While these models don't yet perform as well, they are free, entirely private, and run offline.
+
+    Once the model is running on localhost:8000, change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.ggml import GGML
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=GGML(
+                context_length=2048,
+                server_url="http://localhost:8000")
+        )
+    )
+    ```
+    """
+
+    server_url: str = Field(
+        "http://localhost:8000",
+        description="URL of the OpenAI-compatible server where the model is being served",
+    )
+    model: str = Field(
+        "ggml", description="The name of the model to use (optional for the GGML class)"
+    )
+    
+    api_base: Optional[str] = Field(None, description="OpenAI API base URL.")
+
+    api_type: Optional[Literal["azure", "openai"]] = Field(
+        None, description="OpenAI API type."
+    )
+
+    api_version: Optional[str] = Field(
+        None, description="OpenAI API version. For use with Azure OpenAI Service."
+    )
+
+    engine: Optional[str] = Field(
+        None, description="OpenAI engine. For use with Azure OpenAI Service."
+    )
+
+    template_messages: Optional[
+        Callable[[List[Dict[str, str]]], str]
+    ] = llama2_template_messages
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def get_headers(self):
+        """Return the request headers: JSON content type plus auth when a key is set.
+
+        With an API key, `api_type == "azure"` uses the `api-key` header;
+        every other API type uses a bearer `Authorization` header.
+        """
+        request_headers = {"Content-Type": "application/json"}
+        if self.api_key is None:
+            return request_headers
+
+        if self.api_type == "azure":
+            request_headers["api-key"] = self.api_key
+        else:
+            request_headers["Authorization"] = f"Bearer {self.api_key}"
+        return request_headers
+    
+    def get_full_server_url(self, endpoint: str):
+        """Return the full URL for `endpoint`, ignoring its leading and trailing slashes.
+
+        Non-Azure API types resolve to `{server_url}/v1/{endpoint}`. Azure
+        resolves to the deployment URL and raises when `engine`, `api_version`
+        or `api_base` is missing.
+        """
+        path = endpoint.strip("/")
+
+        if self.api_type != "azure":
+            return f"{self.server_url}/v1/{path}"
+
+        azure_settings = (self.engine, self.api_version, self.api_base)
+        if any(setting is None for setting in azure_settings):
+            raise Exception(
+                "For Azure OpenAI Service, you must specify engine, api_version, and api_base."
+            )
+
+        return f"{self.api_base}/openai/deployments/{self.engine}/{path}?api-version={self.api_version}"
+
+    async def _raw_stream_complete(self, prompt, options):
+        """Stream text from the `completions` endpoint, yielding each `text` field.
+
+        Raises an Exception when the server does not answer with status 200.
+        Lines that are blank, pings, the `[DONE]` marker, or not valid JSON
+        are skipped.
+        """
+        args = self.collect_args(options)
+
+        def extract_text(line: str):
+            """Return the completion text carried by one response line, or None."""
+            if line.strip() == "" or line.startswith((": ping - ", "data: [DONE]")):
+                return None
+            if line.startswith("data: "):
+                line = line[6:]
+            try:
+                payload = json.loads(line)
+            except ValueError:
+                return None
+            choices = payload.get("choices") if isinstance(payload, dict) else None
+            if choices and "text" in choices[0]:
+                return choices[0]["text"]
+            return None
+
+        async with self.create_client_session() as client_session:
+            async with client_session.post(
+                self.get_full_server_url(endpoint="completions"),
+                json={
+                    "prompt": prompt,
+                    "stream": True,
+                    **args,
+                },
+                headers=self.get_headers(),
+                proxy=self.proxy,
+            ) as resp:
+                if resp.status != 200:
+                    raise Exception(
+                        f"Error calling /completions endpoint: {resp.status}"
+                    )
+
+                # Network chunks can end mid-line, so buffer bytes and only
+                # parse complete lines; the unfinished tail waits for more data.
+                pending = b""
+                async for data in resp.content.iter_any():
+                    pending += data
+                    *complete_lines, pending = pending.split(b"\n")
+                    for raw_line in complete_lines:
+                        text = extract_text(raw_line.decode("utf-8"))
+                        if text is not None:
+                            yield text
+
+                # Flush a final line that was not newline-terminated.
+                text = extract_text(pending.decode("utf-8"))
+                if text is not None:
+                    yield text
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        """Stream chat deltas from the `chat/completions` endpoint.
+
+        Yields the `delta` object of the first choice of every event. When the
+        request fails before anything has been yielded, it is retried once. A
+        failure after output has started is re-raised, so chunks that were
+        already yielded are not repeated.
+        """
+        args = self.collect_args(options)
+
+        def parse_delta(line: str):
+            """Return the delta carried by one response line, or None."""
+            if line.strip() == "" or line.startswith((": ping - ", "data: [DONE]")):
+                return None
+            if line.startswith("data: "):
+                line = line[6:]
+            try:
+                return json.loads(line)["choices"][0]["delta"]
+            except (ValueError, KeyError, IndexError, TypeError):
+                # Not a well-formed event; skip it.
+                return None
+
+        async def generator():
+            async with self.create_client_session() as client_session:
+                async with client_session.post(
+                    self.get_full_server_url(endpoint="chat/completions"),
+                    json={"messages": messages, "stream": True, **args},
+                    headers=self.get_headers(),
+                    proxy=self.proxy,
+                ) as resp:
+                    if resp.status != 200:
+                        raise Exception(
+                            f"Error calling /chat/completions endpoint: {resp.status}"
+                        )
+
+                    # Network chunks can end mid-line, so buffer bytes and
+                    # only parse complete lines. Parsing happens outside any
+                    # try block wrapping a `yield`, so closing the generator
+                    # is never swallowed.
+                    pending = b""
+                    async for data, _ in resp.content.iter_chunks():
+                        pending += data
+                        *complete_lines, pending = pending.split(b"\n")
+                        for raw_line in complete_lines:
+                            delta = parse_delta(raw_line.decode("utf-8"))
+                            if delta is not None:
+                                yield delta
+
+                    # Flush a final line that was not newline-terminated.
+                    delta = parse_delta(pending.decode("utf-8"))
+                    if delta is not None:
+                        yield delta
+
+        # Because quite often the first attempt fails, and it works thereafter
+        yielded_any = False
+        try:
+            async for chunk in generator():
+                yielded_any = True
+                yield chunk
+        except Exception as e:
+            if yielded_any:
+                # Restarting now would re-emit what the caller already received.
+                raise
+            logger.warning(f"Error calling /chat/completions endpoint: {e}")
+            async for chunk in generator():
+                yield chunk
+
+    async def _raw_complete(self, prompt: str, options) -> str:
+        """Return the text of the first choice from a non-streamed `completions` request.
+
+        Raises an Exception when the status is not 200, or when the response
+        body cannot be parsed into the expected `choices[0].text` shape.
+        """
+        args = self.collect_args(options)
+
+        async with self.create_client_session() as client_session:
+            async with client_session.post(
+                self.get_full_server_url(endpoint="completions"),
+                json={
+                    "prompt": prompt,
+                    **args,
+                },
+                headers=self.get_headers(),
+                proxy=self.proxy,
+            ) as resp:
+                if resp.status != 200:
+                    raise Exception(
+                        f"Error calling /completions endpoint: {resp.status}"
+                    )
+
+                text = await resp.text()
+                try:
+                    return json.loads(text)["choices"][0]["text"]
+                except Exception as e:
+                    # Chain the original error so the parsing failure stays
+                    # visible in the traceback.
+                    raise Exception(
+                        f"Error calling /completions endpoint: {e}\n\nResponse text: {text}"
+                    ) from e
+
+    async def _complete(self, prompt: str, options: CompletionOptions):
+        """Return the whole completion as one string.
+
+        Models listed in CHAT_MODELS go through `_stream_chat` with the prompt
+        as a single user message; all others use `_raw_stream_complete`.
+        """
+        if self.model in CHAT_MODELS:
+            chat_stream = self._stream_chat(
+                [{"role": "user", "content": prompt}], options
+            )
+            parts = [
+                delta["content"] async for delta in chat_stream if "content" in delta
+            ]
+        else:
+            parts = [
+                text async for text in self._raw_stream_complete(prompt, options)
+            ]
+
+        return "".join(parts)
+
+    async def _stream_complete(self, prompt, options: CompletionOptions):
+        """Yield completion text chunks for `prompt`.
+
+        Models listed in CHAT_MODELS go through `_stream_chat` with the prompt
+        as a single user message; all others use `_raw_stream_complete`.
+        """
+        if self.model not in CHAT_MODELS:
+            async for text in self._raw_stream_complete(prompt, options):
+                yield text
+            return
+
+        user_messages = [{"role": "user", "content": prompt}]
+        async for delta in self._stream_chat(user_messages, options):
+            if "content" in delta:
+                yield delta["content"]
diff --git a/server/continuedev/libs/llm/google_palm_api.py b/server/continuedev/libs/llm/google_palm_api.py
new file mode 100644
index 00000000..3379fefe
--- /dev/null
+++ b/server/continuedev/libs/llm/google_palm_api.py
@@ -0,0 +1,50 @@
+from typing import List
+
+import requests
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM
+
+
+class GooglePaLMAPI(LLM):
+    """
+    The Google PaLM API is currently in public preview, so production applications are not supported yet. However, you can [create an API key in Google MakerSuite](https://makersuite.google.com/u/2/app/apikey) and begin trying out the `chat-bison-001` model. Change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.core.models import Models
+    from continuedev.libs.llm.google_palm_api import GooglePaLMAPI
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=GooglePaLMAPI(
+                model="chat-bison-001",
+                api_key="<MAKERSUITE_API_KEY>",
+            )
+        )
+    )
+    ```
+    """
+
+    api_key: str = Field(..., description="Google PaLM API key")
+
+    model: str = "chat-bison-001"
+
+    async def _stream_complete(self, prompt, options):
+        # NOTE(review): `requests.post` is a blocking call inside a coroutine.
+        api_url = f"https://generativelanguage.googleapis.com/v1beta2/models/{self.model}:generateMessage?key={self.api_key}"
+        body = {"prompt": {"messages": [{"content": prompt}]}}
+        response = requests.post(api_url, json=body)
+        yield response.json()["candidates"][0]["content"]
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        # Only the text of each message is forwarded; roles are not sent.
+        msg_lst = [{"content": message["content"]} for message in messages]
+        api_url = f"https://generativelanguage.googleapis.com/v1beta2/models/{self.model}:generateMessage?key={self.api_key}"
+        body = {"prompt": {"messages": msg_lst}}
+        response = requests.post(api_url, json=body)
+        yield {
+            "content": response.json()["candidates"][0]["content"],
+            "role": "assistant",
+        }
diff --git a/server/continuedev/libs/llm/hf_inference_api.py b/server/continuedev/libs/llm/hf_inference_api.py
new file mode 100644
index 00000000..990ec7c8
--- /dev/null
+++ b/server/continuedev/libs/llm/hf_inference_api.py
@@ -0,0 +1,78 @@
+from typing import Callable, Dict, List, Union
+
+from huggingface_hub import InferenceClient
+from pydantic import Field
+
+from .base import LLM, CompletionOptions
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class HuggingFaceInferenceAPI(LLM):
+    """
+    Hugging Face Inference API is a great option for newly released language models. Sign up for an account and add billing [here](https://huggingface.co/settings/billing), access the Inference Endpoints [here](https://ui.endpoints.huggingface.co), click on “New endpoint”, and fill out the form (e.g. select a model like [WizardCoder-Python-34B-V1.0](https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0)), and then deploy your model by clicking “Create Endpoint”. Change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.core.models import Models
+    from continuedev.libs.llm.hf_inference_api import HuggingFaceInferenceAPI
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=HuggingFaceInferenceAPI(
+                endpoint_url="<INFERENCE_API_ENDPOINT_URL>",
+                hf_token="<HUGGING_FACE_TOKEN>",
+            )
+        )
+    )
+    ```
+    """
+
+    model: str = Field(
+        "Hugging Face Inference API",
+        description="The name of the model to use (optional for the HuggingFaceInferenceAPI class)",
+    )
+    hf_token: str = Field(..., description="Your Hugging Face API token")
+    endpoint_url: str = Field(
+        None, description="Your Hugging Face Inference API endpoint URL"
+    )
+
+    template_messages: Union[
+        Callable[[List[Dict[str, str]]], str], None
+    ] = llama2_template_messages
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def collect_args(self, options: CompletionOptions):
+        """Build the keyword arguments passed to `InferenceClient.text_generation`."""
+        # Stop sequences are cleared here; `_stream_complete` applies them itself.
+        options.stop = None
+        args = super().collect_args(options)
+
+        if "max_tokens" in args:
+            args["max_new_tokens"] = args.pop("max_tokens")
+        if "stop" in args:
+            args["stop_sequences"] = args.pop("stop")
+
+        return args
+
+    async def _stream_complete(self, prompt, options):
+        # Read the stop sequences first: `collect_args` resets `options.stop`.
+        stop = options.stop
+        args = self.collect_args(options)
+        client = InferenceClient(self.endpoint_url, token=self.hf_token)
+        stream = client.text_generation(prompt, stream=True, details=True, **args)
+
+        for r in stream:
+            # skip special tokens
+            if r.token.special:
+                continue
+            # stop if we encounter a stop sequence
+            if stop is not None and r.token.text in stop:
+                break
+            yield r.token.text
diff --git a/server/continuedev/libs/llm/hf_tgi.py b/server/continuedev/libs/llm/hf_tgi.py
new file mode 100644
index 00000000..62458db4
--- /dev/null
+++ b/server/continuedev/libs/llm/hf_tgi.py
@@ -0,0 +1,65 @@
+import json
+from typing import Any, Callable, List
+
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM, CompletionOptions
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class HuggingFaceTGI(LLM):
+    """Streams completions from the `/generate_stream` endpoint of a TGI server."""
+    model: str = "huggingface-tgi"
+    server_url: str = Field(
+        "http://localhost:8080", description="URL of your TGI server"
+    )
+
+    template_messages: Callable[[List[ChatMessage]], str] = llama2_template_messages
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def collect_args(self, options: CompletionOptions) -> Any:
+        """Return request parameters: `max_tokens` becomes `max_new_tokens`, `best_of` is 1."""
+        args = super().collect_args(options)
+        args = {**args, "max_new_tokens": args.get("max_tokens", 1024), "best_of": 1}
+        args.pop("max_tokens", None)
+        args.pop("model", None)
+        args.pop("functions", None)
+        return args
+
+    async def _stream_complete(self, prompt, options):
+        args = self.collect_args(options)
+
+        async with self.create_client_session() as client_session:
+            async with client_session.post(
+                f"{self.server_url}/generate_stream",
+                json={"inputs": prompt, "parameters": args},
+                headers={"Content-Type": "application/json"},
+                proxy=self.proxy,
+            ) as resp:
+                # Iterate complete lines rather than arbitrary network chunks, so
+                # an event split across reads is not lost to a JSON parse error.
+                async for line in resp.content:
+                    chunk = line.decode("utf-8")
+                    if chunk.startswith("data: "):
+                        chunk = chunk[len("data: ") :]
+                    elif chunk.startswith("data:"):
+                        chunk = chunk[len("data:") :]
+
+                    if chunk.strip() == "":
+                        continue
+
+                    try:
+                        json_chunk = json.loads(chunk)
+                    except Exception as e:
+                        print(f"Error parsing JSON: {e}")
+                        continue
+
+                    yield json_chunk["token"]["text"]
diff --git a/server/continuedev/libs/llm/hugging_face.py b/server/continuedev/libs/llm/hugging_face.py
new file mode 100644
index 00000000..c2e934c0
--- /dev/null
+++ b/server/continuedev/libs/llm/hugging_face.py
@@ -0,0 +1,19 @@
+# TODO: This class is far out of date
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from .llm import LLM
+
+
+class HuggingFace(LLM):  # NOTE(review): the TODO heading this file marks the class as far out of date
+    def __init__(self, model_path: str = "Salesforce/codegen-2B-mono"):
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)  # tokenizer for model_path
+        self.model = AutoModelForCausalLM.from_pretrained(model_path)  # model for model_path
+
+    def complete(self, prompt: str, **kwargs):
+        args = {"max_tokens": 100}  # default, overridden by a caller-supplied max_tokens
+        args.update(kwargs)
+        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        generated_ids = self.model.generate(input_ids, max_length=args["max_tokens"])  # max_tokens is passed as max_length
+        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
diff --git a/server/continuedev/libs/llm/llamacpp.py b/server/continuedev/libs/llm/llamacpp.py
new file mode 100644
index 00000000..bc856a52
--- /dev/null
+++ b/server/continuedev/libs/llm/llamacpp.py
@@ -0,0 +1,86 @@
+import json
+from typing import Any, Callable, Dict
+
+from pydantic import Field
+
+from .base import LLM
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class LlamaCpp(LLM):
+    r"""
+    Run the llama.cpp server binary to start the API server. If running on a remote server, be sure to set host to 0.0.0.0:
+
+    ```shell
+    .\server.exe -c 4096 --host 0.0.0.0 -t 16 --mlock -m models\meta\llama\codellama-7b-instruct.Q8_0.gguf
+    ```
+
+    After it's up and running, change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.llamacpp import LlamaCpp
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=LlamaCpp(
+                context_length=4096,
+                server_url="http://localhost:8080")
+        )
+    )
+    ```
+    """
+
+    model: str = "llamacpp"
+    server_url: str = Field("http://localhost:8080", description="URL of the server")
+
+    llama_cpp_args: Dict[str, Any] = Field(
+        {"stop": ["[INST]"]},
+        description="A list of additional arguments to pass to llama.cpp. See [here](https://github.com/ggerganov/llama.cpp/tree/master/examples/server#api-endpoints) for the complete catalog of options.",
+    )
+
+    template_messages: Callable = llama2_template_messages
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def collect_args(self, options) -> Any:
+        """Return the request arguments, renamed and filtered for llama.cpp."""
+        args = super().collect_args(options)
+        if "max_tokens" in args:
+            args["n_predict"] = args["max_tokens"]
+            del args["max_tokens"]
+        if "frequency_penalty" in args:
+            del args["frequency_penalty"]
+        if "presence_penalty" in args:
+            del args["presence_penalty"]
+
+        for k, v in self.llama_cpp_args.items():
+            if k not in args:
+                args[k] = v
+
+        return args
+
+    async def _stream_complete(self, prompt, options):
+        """Stream the text pieces returned by the server's `/completion` endpoint."""
+        args = self.collect_args(options)
+        headers = {"Content-Type": "application/json"}
+
+        async with self.create_client_session() as client_session:
+            async with client_session.post(
+                f"{self.server_url}/completion",
+                json={"prompt": prompt, "stream": True, **args},
+                headers=headers,
+                proxy=self.proxy,
+            ) as resp:
+                async for line in resp.content:
+                    content = line.decode("utf-8")
+                    if content.strip() == "":
+                        continue
+                    # The first 6 characters are skipped before parsing --
+                    # presumably the `data: ` event prefix; TODO confirm.
+                    yield json.loads(content[6:])["content"]
diff --git a/server/continuedev/libs/llm/ollama.py b/server/continuedev/libs/llm/ollama.py
new file mode 100644
index 00000000..82cbc852
--- /dev/null
+++ b/server/continuedev/libs/llm/ollama.py
@@ -0,0 +1,106 @@
+import json
+from typing import Callable
+
+import aiohttp
+from pydantic import Field
+
+from ...core.main import ContinueCustomException
+from ..util.logging import logger
+from .base import LLM
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class Ollama(LLM):
+    """
+    [Ollama](https://ollama.ai/) is an application for Mac and Linux that makes it easy to locally run open-source models, including Llama-2. Download the app from the website, and it will walk you through setup in a couple of minutes. You can also read more in their [README](https://github.com/jmorganca/ollama). Continue can then be configured to use the `Ollama` LLM class:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.ollama import Ollama
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=Ollama(model="llama2")
+        )
+    )
+    ```
+    """
+
+    model: str = "llama2"
+    server_url: str = Field(
+        "http://localhost:11434", description="URL of the Ollama server"
+    )
+
+    _client_session: aiohttp.ClientSession = None
+
+    template_messages: Callable = llama2_template_messages
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    async def start(self, **kwargs):
+        await super().start(**kwargs)
+        self._client_session = self.create_client_session()
+        try:
+            async with self._client_session.post(
+                f"{self.server_url}/api/generate",
+                proxy=self.proxy,
+                json={
+                    "prompt": "",
+                    "model": self.model,
+                },
+            ) as _:
+                pass
+        except Exception as e:
+            logger.warning(f"Error pre-loading Ollama model: {e}")
+
+    async def stop(self):
+        if self._client_session is not None:
+            await self._client_session.close()
+
+    async def get_downloaded_models(self):
+        async with self._client_session.get(
+            f"{self.server_url}/api/tags",
+            proxy=self.proxy,
+        ) as resp:
+            js_data = await resp.json()
+            return [m["name"] for m in js_data["models"]]
+
+    async def _stream_complete(self, prompt, options):
+        async with self._client_session.post(
+            f"{self.server_url}/api/generate",
+            json={
+                "template": prompt,
+                "model": self.model,
+                "system": self.system_message,
+                "options": {"temperature": options.temperature},
+            },
+            proxy=self.proxy,
+        ) as resp:
+            if resp.status == 400:
+                txt = await resp.text()
+                extra_msg = ""
+                if "no such file" in txt:
+                    extra_msg = f"\n\nThis means that the model '{self.model}' is not downloaded.\n\nYou have the following models downloaded: {', '.join(await self.get_downloaded_models())}.\n\nTo download this model, run `ollama run {self.model}` in your terminal."
+                raise ContinueCustomException(
+                    f"Ollama returned an error: {txt}{extra_msg}",
+                    "Invalid request to Ollama",
+                )
+            elif resp.status != 200:
+                raise ContinueCustomException(
+                    f"Ollama returned an error: {await resp.text()}",
+                    "Invalid request to Ollama",
+                )
+            # The body is parsed as one JSON object per line. Iterate whole lines
+            # so an object split across network reads never reaches json.loads.
+            async for line in resp.content:
+                chunk = line.decode("utf-8")
+                if chunk.strip() != "":
+                    j = json.loads(chunk)
+                    if "response" in j:
+                        yield j["response"]
diff --git a/server/continuedev/libs/llm/openai.py b/server/continuedev/libs/llm/openai.py
new file mode 100644
index 00000000..ba29279b
--- /dev/null
+++ b/server/continuedev/libs/llm/openai.py
@@ -0,0 +1,156 @@
+from typing import Callable, List, Literal, Optional
+
+import certifi
+import openai
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM
+
+CHAT_MODELS = {
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-16k",
+    "gpt-4",
+    "gpt-3.5-turbo-0613",
+    "gpt-4-32k",
+}
+MAX_TOKENS_FOR_MODEL = {
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0613": 4096,
+    "gpt-3.5-turbo-16k": 16_384,
+    "gpt-4": 8192,
+    "gpt-35-turbo-16k": 16_384,
+    "gpt-35-turbo-0613": 4096,
+    "gpt-35-turbo": 4096,
+    "gpt-4-32k": 32_768,
+}
+
+
+class OpenAI(LLM):
+    """
+    The OpenAI class can be used to access OpenAI models like gpt-4 and gpt-3.5-turbo.
+
+    If you are locally serving a model that uses an OpenAI-compatible server, you can simply change the `api_base` in the `OpenAI` class like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.openai import OpenAI
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=OpenAI(
+                api_key="EMPTY",
+                model="<MODEL_NAME>",
+                api_base="http://localhost:8000", # change to your server
+            )
+        )
+    )
+    ```
+
+    Options for serving models locally with an OpenAI-compatible server include:
+
+    - [text-gen-webui](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai#setup--installation)
+    - [FastChat](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md)
+    - [LocalAI](https://localai.io/basics/getting_started/)
+    - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python#web-server)
+    """
+
+    api_key: str = Field(
+        ...,
+        description="OpenAI API key",
+    )
+
+    proxy: Optional[str] = Field(None, description="Proxy URL to use for requests.")
+
+    api_base: Optional[str] = Field(None, description="OpenAI API base URL.")
+
+    api_type: Optional[Literal["azure", "openai"]] = Field(
+        None, description="OpenAI API type."
+    )
+
+    api_version: Optional[str] = Field(
+        None, description="OpenAI API version. For use with Azure OpenAI Service."
+    )
+
+    engine: Optional[str] = Field(
+        None, description="OpenAI engine. For use with Azure OpenAI Service."
+    )
+
+    async def start(
+        self, unique_id: Optional[str] = None, write_log: Optional[Callable[[str], None]] = None
+    ):
+        await super().start(write_log=write_log, unique_id=unique_id)
+
+        if self.context_length is None:
+            self.context_length = MAX_TOKENS_FOR_MODEL.get(self.model, 4096)  # unlisted models default to 4096
+
+        openai.api_key = self.api_key  # NOTE: set on the shared `openai` module, not per instance
+        if self.api_type is not None:
+            openai.api_type = self.api_type
+        if self.api_base is not None:
+            openai.api_base = self.api_base
+        if self.api_version is not None:
+            openai.api_version = self.api_version
+
+        if self.verify_ssl is not None and self.verify_ssl is False:
+            openai.verify_ssl_certs = False
+
+        if self.proxy is not None:
+            openai.proxy = self.proxy
+
+        openai.ca_bundle_path = self.ca_bundle_path or certifi.where()  # falls back to certifi's CA bundle
+
+    def collect_args(self, options):
+        args = super().collect_args(options)
+        if self.engine is not None:
+            args["engine"] = self.engine
+
+        if not args["model"].endswith("0613") and "functions" in args:
+            del args["functions"]  # "functions" is kept only for model names ending in "0613"
+
+        return args
+
+    async def _stream_complete(self, prompt, options):
+        args = self.collect_args(options)
+        args["stream"] = True
+
+        if args["model"] in CHAT_MODELS:
+            async for chunk in await openai.ChatCompletion.acreate(
+                messages=[{"role": "user", "content": prompt}],
+                **args,
+                headers=self.headers,
+            ):
+                if len(chunk.choices) > 0 and "content" in chunk.choices[0].delta:
+                    yield chunk.choices[0].delta.content
+        else:
+            async for chunk in await openai.Completion.acreate(prompt=prompt, **args, headers=self.headers):
+                if len(chunk.choices) > 0:
+                    yield chunk.choices[0].text
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        args = self.collect_args(options)
+
+        async for chunk in await openai.ChatCompletion.acreate(
+            messages=messages,
+            stream=True,
+            **args,
+            headers=self.headers,
+        ):
+            if not hasattr(chunk, "choices") or len(chunk.choices) == 0:
+                continue  # skip chunks that carry no choices
+            yield chunk.choices[0].delta
+
+    async def _complete(self, prompt: str, options):
+        args = self.collect_args(options)
+
+        if args["model"] in CHAT_MODELS:
+            resp = await openai.ChatCompletion.acreate(
+                messages=[{"role": "user", "content": prompt}],
+                **args,
+                headers=self.headers,
+            )
+            return resp.choices[0].message.content
+        else:
+            return (
+                (await openai.Completion.acreate(prompt=prompt, **args, headers=self.headers)).choices[0].text
+            )
diff --git a/server/continuedev/libs/llm/openai_free_trial.py b/server/continuedev/libs/llm/openai_free_trial.py
new file mode 100644
index 00000000..b6e707f9
--- /dev/null
+++ b/server/continuedev/libs/llm/openai_free_trial.py
@@ -0,0 +1,83 @@
+from typing import Callable, List, Optional
+
+from ...core.main import ChatMessage
+from .base import LLM
+from .openai import OpenAI
+from .proxy_server import ProxyServer
+
+
+class OpenAIFreeTrial(LLM):
+    """
+    With the `OpenAIFreeTrial` `LLM`, new users can try out Continue with GPT-4 using a proxy server that securely makes calls to OpenAI using our API key. Continue should just work the first time you install the extension in VS Code.
+
+    Once you are using Continue regularly though, you will need to add an OpenAI API key that has access to GPT-4 by following these steps:
+
+    1. Copy your API key from https://platform.openai.com/account/api-keys
+    2. Open `~/.continue/config.py`. You can do this by using the '/config' command in Continue
+    3. Change the default LLMs to look like this:
+
+    ```python title="~/.continue/config.py"
+    API_KEY = "<API_KEY>"
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=OpenAIFreeTrial(model="gpt-4", api_key=API_KEY),
+            summarize=OpenAIFreeTrial(model="gpt-3.5-turbo", api_key=API_KEY)
+        )
+    )
+    ```
+
+    The `OpenAIFreeTrial` class will automatically switch to using your API key instead of ours. If you'd like to explicitly use one or the other, you can use the `ProxyServer` or `OpenAI` classes instead.
+
+    These classes support any models available through the OpenAI API, assuming your API key has access, including "gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", and "gpt-4-32k".
+    """
+
+    api_key: Optional[str] = None
+
+    llm: Optional[LLM] = None
+
+    def update_llm_properties(self):
+        if self.llm is not None:
+            self.llm.system_message = self.system_message
+
+    async def start(
+        self, write_log: Callable[[str], None] = None, unique_id: Optional[str] = None
+    ):
+        await super().start(write_log=write_log, unique_id=unique_id)
+        if self.api_key is None or self.api_key.strip() == "":
+            self.llm = ProxyServer(
+                model=self.model,
+                verify_ssl=self.verify_ssl,
+                ca_bundle_path=self.ca_bundle_path,
+            )
+        else:
+            self.llm = OpenAI(
+                api_key=self.api_key,
+                model=self.model,
+                verify_ssl=self.verify_ssl,
+                ca_bundle_path=self.ca_bundle_path,
+            )
+
+        await self.llm.start(write_log=write_log, unique_id=unique_id)
+
+    async def stop(self):
+        # `llm` is only set by `start`; without it there is nothing to stop.
+        if self.llm is not None:
+            await self.llm.stop()
+
+    async def _complete(self, prompt: str, options):
+        self.update_llm_properties()
+        return await self.llm._complete(prompt, options)
+
+    async def _stream_complete(self, prompt, options):
+        self.update_llm_properties()
+        async for item in self.llm._stream_complete(prompt, options):
+            yield item
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        self.update_llm_properties()
+        async for item in self.llm._stream_chat(messages=messages, options=options):
+            yield item
+
+    def count_tokens(self, text: str):
+        return self.llm.count_tokens(text)
diff --git a/server/continuedev/libs/llm/prompt_utils.py b/server/continuedev/libs/llm/prompt_utils.py
new file mode 100644
index 00000000..930b5220
--- /dev/null
+++ b/server/continuedev/libs/llm/prompt_utils.py
@@ -0,0 +1,76 @@
+from typing import Dict, List, Union
+
+from ...models.filesystem import RangeInFileWithContents
+from ...models.filesystem_edit import FileEdit
+
+
+class MarkdownStyleEncoderDecoder:
+    """Encode file ranges as Markdown-style blocks and decode completions back."""
+
+    # Filename -> the part of the file you care about
+    range_in_files: List[RangeInFileWithContents]
+
+    def __init__(self, range_in_files: List[RangeInFileWithContents]):
+        self.range_in_files = range_in_files
+
+    def encode(self) -> str:
+        """Render every range as a "File (path)" header plus a fenced block."""
+        rendered = []
+        for rif in self.range_in_files:
+            rendered.append(f"File ({rif.filepath})\n```\n{rif.contents}\n```")
+        return "\n\n".join(rendered)
+
+    def _suggestions_to_file_edits(self, suggestions: Dict[str, str]) -> List[FileEdit]:
+        """Turn {filepath: replacement} into FileEdits over the matching ranges."""
+        edits: List[FileEdit] = []
+        for path, replacement in suggestions.items():
+            # Only the first range whose filepath matches is edited; a
+            # suggestion for an unknown file is ignored.
+            match = next((r for r in self.range_in_files if r.filepath == path), None)
+            if match is None:
+                continue
+            edits.append(
+                FileEdit(range=match.range, filepath=match.filepath, replacement=replacement)
+            )
+
+        return edits
+
+    def _decode_to_suggestions(self, completion: str) -> Dict[str, str]:
+        """Parse "File (path)" headers followed by fenced blocks into {path: text}."""
+        if not self.range_in_files:
+            return {}
+
+        fence = "```"
+        if fence not in completion:
+            completion = fence + "\n" + completion + "\n" + fence
+        # A completion that opens directly with a bare fence is attributed to
+        # the first known file.
+        if completion.strip().splitlines()[0].strip() == fence:
+            completion = f"File ({self.range_in_files[0].filepath})\n" + completion
+
+        suggestions: Dict[str, str] = {}
+        buffered: List[str] = []
+        path: Union[str, None] = None
+        saw_header = False
+        in_block = False
+        for raw in completion.splitlines():
+            if raw.strip().startswith("File ("):
+                saw_header = True
+                path = raw.strip()[6:-1]
+            elif saw_header and raw.startswith(fence):
+                saw_header = False
+                in_block = True
+            elif in_block and raw.startswith(fence):
+                in_block = False
+                suggestions[path] = "\n".join(buffered)
+                buffered = []
+                path = None
+            elif in_block:
+                buffered.append(raw)
+
+        return suggestions
+
+    def decode(self, completion: str) -> List[FileEdit]:
+        """Parse a completion and convert it into FileEdits."""
+        parsed = self._decode_to_suggestions(completion)
+        return self._suggestions_to_file_edits(parsed)
diff --git a/server/continuedev/libs/llm/prompts/chat.py b/server/continuedev/libs/llm/prompts/chat.py
new file mode 100644
index 00000000..036f1b1a
--- /dev/null
+++ b/server/continuedev/libs/llm/prompts/chat.py
@@ -0,0 +1,174 @@
+from textwrap import dedent
+from typing import Dict, List
+
+from anthropic import AI_PROMPT, HUMAN_PROMPT
+
+
+def anthropic_template_messages(messages: List[Dict[str, str]]) -> str:
+    """Render chat messages as alternating Human/Assistant prompt turns."""
+    human_roles = ("user", "system")
+    parts = []
+
+    # Anthropic prompt must start with a Human turn
+    if len(messages) > 0 and messages[0]["role"] not in human_roles:
+        parts.append(f"{HUMAN_PROMPT} Hello.")
+
+    for msg in messages:
+        speaker = HUMAN_PROMPT if msg["role"] in human_roles else AI_PROMPT
+        parts.append(f"{speaker} {msg['content']} ")
+
+    parts.append(AI_PROMPT)
+    return "".join(parts)
+
+
+def template_alpaca_messages(msgs: List[Dict[str, str]]) -> str:
+    """Render chat messages in the Alpaca "### Instruction / ### Response" format.
+
+    A leading system message is emitted as a preamble. The caller's list is
+    not modified, and an empty list yields just the final response header.
+    """
+    prompt = ""
+    if msgs and msgs[0]["role"] == "system":
+        prompt += f"{msgs[0]['content']}\n"
+        msgs = msgs[1:]
+    for msg in msgs:
+        prompt += "### Instruction:\n" if msg["role"] == "user" else "### Response:\n"
+        prompt += f"{msg['content']}\n"
+    return prompt + "### Response:\n"
+
+
+def raw_input_template(msgs: List[Dict[str, str]]) -> str:
+    return msgs[-1]["content"]  # only the last message's text is used, untemplated
+
+
+SQL_CODER_DEFAULT_SCHEMA = """\
+CREATE TABLE products (
+  product_id INTEGER PRIMARY KEY, -- Unique ID for each product
+  name VARCHAR(50), -- Name of the product
+  price DECIMAL(10,2), -- Price of each unit of the product
+  quantity INTEGER  -- Current quantity in stock
+);
+
+CREATE TABLE customers (
+   customer_id INTEGER PRIMARY KEY, -- Unique ID for each customer
+   name VARCHAR(50), -- Name of the customer
+   address VARCHAR(100) -- Mailing address of the customer
+);
+
+CREATE TABLE salespeople (
+  salesperson_id INTEGER PRIMARY KEY, -- Unique ID for each salesperson
+  name VARCHAR(50), -- Name of the salesperson
+  region VARCHAR(50) -- Geographic sales region
+);
+
+CREATE TABLE sales (
+  sale_id INTEGER PRIMARY KEY, -- Unique ID for each sale
+  product_id INTEGER, -- ID of product sold
+  customer_id INTEGER,  -- ID of customer who made purchase
+  salesperson_id INTEGER, -- ID of salesperson who made the sale
+  sale_date DATE, -- Date the sale occurred
+  quantity INTEGER -- Quantity of product sold
+);
+
+CREATE TABLE product_suppliers (
+  supplier_id INTEGER PRIMARY KEY, -- Unique ID for each supplier
+  product_id INTEGER, -- Product ID supplied
+  supply_price DECIMAL(10,2) -- Unit price charged by supplier
+);
+
+-- sales.product_id can be joined with products.product_id
+-- sales.customer_id can be joined with customers.customer_id
+-- sales.salesperson_id can be joined with salespeople.salesperson_id
+-- product_suppliers.product_id can be joined with products.product_id
+"""
+
+
+def _sqlcoder_template_messages(
+    msgs: List[Dict[str, str]], schema: str = SQL_CODER_DEFAULT_SCHEMA
+) -> str:
+    question = msgs[-1]["content"]  # only the last message is used as the question
+    return f"""\
+Your task is to convert a question into a SQL query, given a Postgres database schema.
+Adhere to these rules:
+- **Deliberately go through the question and database schema word by word** to appropriately answer the question
+- **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.
+- When creating a ratio, always cast the numerator as float
+
+### Input:
+Generate a SQL query that answers the question `{question}`.
+This query will run on a database whose schema is represented in this string:
+{schema}
+
+### Response:
+Based on your instructions, here is the SQL query I have generated to answer the question `{question}`:
+```sql
+"""
+
+
+def sqlcoder_template_messages(schema: str = SQL_CODER_DEFAULT_SCHEMA):
+    """Return a message-templating function bound to `schema`."""
+    bound = SQL_CODER_DEFAULT_SCHEMA if schema in ("<MY_DATABASE_SCHEMA>", "") else schema
+
+    def fn(msgs):
+        return _sqlcoder_template_messages(msgs, schema=bound)
+
+    fn.__name__ = "sqlcoder_template_messages"
+    return fn
+
+
+def llama2_template_messages(msgs: List[Dict[str, str]]) -> str:
+    """Render chat messages in the Llama 2 `[INST] ... [/INST]` prompt format.
+
+    A non-empty leading system message is wrapped in `<<SYS>>` tags and merged
+    into the first instruction. The caller's list is never modified.
+    """
+    if len(msgs) > 0 and msgs[0]["role"] == "assistant":
+        # These models aren't trained to handle assistant message coming first,
+        # and typically these are just introduction messages from Continue
+        msgs = msgs[1:]
+
+    if len(msgs) == 0:
+        return ""
+
+    prompt = ""
+    has_system = msgs[0]["role"] == "system"
+
+    if has_system and msgs[0]["content"].strip() == "":
+        has_system = False
+        msgs = msgs[1:]
+
+    if has_system:
+        # Built without dedent: dedenting after interpolation would leave the
+        # tags indented whenever the system message spans several lines.
+        system_message = f"<<SYS>>\n{msgs[0]['content']}\n<</SYS>>\n\n"
+        if len(msgs) > 1:
+            prompt += f"[INST] {system_message}{msgs[1]['content']} [/INST]"
+        else:
+            # Return the prompt itself (a bare `return` would yield None).
+            return f"[INST] {system_message} [/INST]"
+
+    for i in range(2 if has_system else 0, len(msgs)):
+        if msgs[i]["role"] == "user":
+            prompt += f"[INST] {msgs[i]['content']} [/INST]"
+        else:
+            prompt += msgs[i]["content"] + " "
+
+    return prompt
+
+
+def code_llama_template_messages(msgs: List[Dict[str, str]]) -> str:
+    return f"[INST] {msgs[-1]['content']}\n[/INST]"  # only the last message is wrapped; earlier ones are ignored
+
+
+def extra_space_template_messages(msgs: List[Dict[str, str]]) -> str:
+    return f" {msgs[-1]['content']}"  # last message's text with one leading space
+
+
+def code_llama_python_template_messages(msgs: List[Dict[str, str]]) -> str:
+    # Built without dedent: dedenting after interpolation leaves the template
+    # indented whenever the (often multi-line) message has unindented lines.
+    return (
+        "[INST]\n"
+        f"You are an expert Python programmer and personal assistant, here is your task: {msgs[-1]['content']}\n"
+        "Your answer should start with a [PYTHON] tag and end with a [/PYTHON] tag.\n[/INST]"
+    )
diff --git a/server/continuedev/libs/llm/prompts/edit.py b/server/continuedev/libs/llm/prompts/edit.py
new file mode 100644
index 00000000..eaa694c5
--- /dev/null
+++ b/server/continuedev/libs/llm/prompts/edit.py
@@ -0,0 +1,27 @@
+from textwrap import dedent
+
+# Edit prompt that asks the model to output only the rewritten code.
+# `{{{code_to_edit}}}` and `{{{user_input}}}` are left as literal placeholders
+# in the string; presumably a mustache-style renderer fills them in elsewhere
+# — TODO confirm against the code that consumes these templates.
+simplified_edit_prompt = dedent(
+    """\
+            Consider the following code:
+            ```
+            {{{code_to_edit}}}
+            ```
+            Edit the code to perfectly satisfy the following user request:
+            {{{user_input}}}
+            Output nothing except for the code. No code block, no English explanation, no start/end tags."""
+)
+
+# Completion-style edit prompt: it shows the code and the request, then ends
+# with "Here is the code after editing:" so the model continues with the code.
+# Uses the same `{{{code_to_edit}}}` / `{{{user_input}}}` placeholders.
+simplest_edit_prompt = dedent(
+    """\
+            Here is the code before editing:
+            ```
+            {{{code_to_edit}}}
+            ```
+
+            Here is the edit requested:
+            "{{{user_input}}}"
+            
+            Here is the code after editing:"""
+)
+
+# Infill prompt: file prefix and suffix around a literal `<FILL>` marker.
+# NOTE(review): this uses double braces where the prompts above use triple
+# braces; if the renderer is mustache-style, double braces HTML-escape the
+# substituted text — confirm this is intended for source code.
+codellama_infill_edit_prompt = "{{file_prefix}}<FILL>{{file_suffix}}"
diff --git a/server/continuedev/libs/llm/proxy_server.py b/server/continuedev/libs/llm/proxy_server.py
new file mode 100644
index 00000000..7c3462eb
--- /dev/null
+++ b/server/continuedev/libs/llm/proxy_server.py
@@ -0,0 +1,108 @@
+import json
+import traceback
+from typing import List
+
+import aiohttp
+
+from ...core.main import ChatMessage
+from ..util.telemetry import posthog_logger
+from .base import LLM
+
+# Base URL that every ProxyServer request in this module is posted to.
+# The commented-out value below is a localhost address, presumably kept for
+# local development — TODO confirm before removing.
+# SERVER_URL = "http://127.0.0.1:8080"
+SERVER_URL = "https://proxy-server-l6vsfbzhba-uw.a.run.app"
+
+# Context length per model name; ProxyServer.start() looks up `self.model`
+# here and assigns the result to `self.context_length`.
+MAX_TOKENS_FOR_MODEL = {
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0613": 4096,
+    "gpt-3.5-turbo-16k": 16384,
+    "gpt-4": 8192,
+}
+
+
+class ProxyServer(LLM):
+    """LLM backend that forwards every request to the proxy at ``SERVER_URL``.
+
+    Completion requests are posted to ``/complete``, ``/stream_complete`` and
+    ``/stream_chat``; each request carries the instance's ``unique_id`` header.
+    """
+
+    _client_session: aiohttp.ClientSession
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    async def start(
+        self,
+        **kwargs,
+    ):
+        """Start the base LLM, open the HTTP session and set the context length."""
+        await super().start(**kwargs)
+        self._client_session = self.create_client_session()
+
+        # Models missing from the table keep the context length that is already
+        # configured instead of failing startup with a KeyError.
+        self.context_length = MAX_TOKENS_FOR_MODEL.get(
+            self.model, self.context_length
+        )
+
+    async def stop(self):
+        """Close the HTTP session opened by ``start``."""
+        await self._client_session.close()
+
+    def get_headers(self):
+        """Return the headers sent with every proxy request."""
+        return {"unique_id": self.unique_id}
+
+    def _report_parse_error(self, error: Exception) -> None:
+        """Send a telemetry event describing a stream_chat parsing failure."""
+        posthog_logger.capture_event(
+            "proxy_server_parse_error",
+            {
+                "error_title": "Proxy server stream_chat parsing failed",
+                "error_message": "\n".join(traceback.format_exception(error)),
+            },
+        )
+
+    async def _complete(self, prompt: str, options):
+        """Post ``prompt`` as a single user message and return the response text.
+
+        Raises:
+            Exception: with the response body when the status is not 200.
+        """
+        args = self.collect_args(options)
+
+        async with self._client_session.post(
+            f"{SERVER_URL}/complete",
+            json={"messages": [{"role": "user", "content": prompt}], **args},
+            headers=self.get_headers(),
+            proxy=self.proxy,
+        ) as resp:
+            resp_text = await resp.text()
+            if resp.status != 200:
+                raise Exception(resp_text)
+
+            return resp_text
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        """Yield the JSON objects streamed back by ``/stream_chat``.
+
+        The body is treated as newline-separated JSON. Pieces that fail to
+        parse are reported through telemetry and skipped, not raised.
+
+        Raises:
+            Exception: with the response body when the status is not 200.
+        """
+        args = self.collect_args(options)
+        async with self._client_session.post(
+            f"{SERVER_URL}/stream_chat",
+            json={"messages": messages, **args},
+            headers=self.get_headers(),
+            proxy=self.proxy,
+        ) as resp:
+            if resp.status != 200:
+                raise Exception(await resp.text())
+
+            # The end-of-HTTP-chunk flag from iter_chunks() is ignored on
+            # purpose: it is False for partial reads and for non-chunked
+            # responses, so stopping on it would truncate the stream.
+            pending = b""
+            pending_error = None
+            async for data, _end_of_http_chunk in resp.content.iter_chunks():
+                pieces = (pending + data).split(b"\n")
+                pending, pending_error = b"", None
+                final_index = len(pieces) - 1
+                for index, piece in enumerate(pieces):
+                    if not piece.strip():
+                        continue
+                    try:
+                        loaded_chunk = json.loads(piece.decode("utf-8"))
+                    except ValueError as e:
+                        if index == final_index:
+                            # Possibly an object cut off by the read boundary:
+                            # retry once more data has arrived.
+                            pending, pending_error = piece, e
+                        else:
+                            self._report_parse_error(e)
+                        continue
+                    yield loaded_chunk
+
+            if pending_error is not None:
+                # The stream ended with a piece that never became valid JSON.
+                self._report_parse_error(pending_error)
+
+    async def _stream_complete(self, prompt, options):
+        """Yield decoded text pieces streamed back by ``/stream_complete``.
+
+        Raises:
+            Exception: with the response body when the status is not 200.
+            UnicodeDecodeError: when the body is not valid UTF-8.
+        """
+        # Local import: only this method needs the incremental decoder.
+        import codecs
+
+        args = self.collect_args(options)
+
+        async with self._client_session.post(
+            f"{SERVER_URL}/stream_complete",
+            json={"messages": [{"role": "user", "content": prompt}], **args},
+            headers=self.get_headers(),
+            proxy=self.proxy,
+        ) as resp:
+            if resp.status != 200:
+                raise Exception(await resp.text())
+
+            # An incremental decoder keeps a multi-byte character that is split
+            # across two network reads from raising a decode error.
+            decoder = codecs.getincrementaldecoder("utf-8")()
+            async for line in resp.content.iter_any():
+                decoded_line = decoder.decode(line)
+                if decoded_line:
+                    yield decoded_line
+
+            # Raises if the stream ended in the middle of a character.
+            decoder.decode(b"", final=True)
diff --git a/server/continuedev/libs/llm/queued.py b/server/continuedev/libs/llm/queued.py
new file mode 100644
index 00000000..2db749eb
--- /dev/null
+++ b/server/continuedev/libs/llm/queued.py
@@ -0,0 +1,77 @@
+import asyncio
+from typing import Any, List, Union
+
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM, CompletionOptions
+
+
+class QueuedLLM(LLM):
+    """
+    QueuedLLM exists to make up for LLM servers that cannot handle multiple requests at once. It uses a lock to ensure that only one request is being processed at a time.
+
+    If you are already using another LLM class and are experiencing this problem, you can just wrap it with the QueuedLLM class like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.queued import QueuedLLM
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=QueuedLLM(llm=<OTHER_LLM_CLASS>)
+        )
+    )
+    ```
+    """
+
+    llm: LLM = Field(..., description="The LLM to wrap with a lock")
+    _lock: asyncio.Lock
+
+    model: str = "queued"
+
+    def dict(self, **kwargs):
+        """Return the wrapped LLM's ``dict()`` instead of this wrapper's own."""
+        wrapped = self.llm
+        return wrapped.dict(**kwargs)
+
+    async def start(self, *args, **kwargs):
+        """Start this wrapper and the wrapped LLM, then mirror its settings."""
+        await super().start(*args, **kwargs)
+
+        wrapped = self.llm
+        await wrapped.start(*args, **kwargs)
+
+        # The lock is created here, so it does not exist before start() runs.
+        self._lock = asyncio.Lock()
+
+        # Copy the wrapped model's settings onto the wrapper.
+        self.model = wrapped.model
+        self.template_messages = wrapped.template_messages
+        self.prompt_templates = wrapped.prompt_templates
+        self.context_length = wrapped.context_length
+
+    async def stop(self):
+        """Stop the wrapped LLM."""
+        wrapped = self.llm
+        await wrapped.stop()
+
+    def collect_args(self, options: CompletionOptions):
+        """Delegate argument collection to the wrapped LLM."""
+        wrapped = self.llm
+        return wrapped.collect_args(options)
+
+    def compile_chat_messages(
+        self,
+        options: CompletionOptions,
+        msgs: List[ChatMessage],
+        functions: Union[List[Any], None] = None,
+    ):
+        """Delegate chat message compilation to the wrapped LLM."""
+        wrapped = self.llm
+        return wrapped.compile_chat_messages(options, msgs, functions)
+
+    def template_prompt_like_messages(self, prompt: str) -> str:
+        """Delegate prompt templating to the wrapped LLM."""
+        wrapped = self.llm
+        return wrapped.template_prompt_like_messages(prompt)
+
+    async def _complete(self, prompt: str, options: CompletionOptions):
+        """Run the wrapped LLM's completion while holding the lock."""
+        async with self._lock:
+            return await self.llm._complete(prompt, options)
+
+    async def _stream_complete(self, prompt: str, options: CompletionOptions):
+        """Stream a completion from the wrapped LLM while holding the lock."""
+        async with self._lock:
+            stream = self.llm._stream_complete(prompt, options)
+            async for piece in stream:
+                yield piece
+
+    async def _stream_chat(
+        self, messages: List[ChatMessage], options: CompletionOptions
+    ):
+        """Stream a chat response from the wrapped LLM while holding the lock."""
+        async with self._lock:
+            stream = self.llm._stream_chat(messages, options)
+            async for piece in stream:
+                yield piece
diff --git a/server/continuedev/libs/llm/replicate.py b/server/continuedev/libs/llm/replicate.py
new file mode 100644
index 00000000..3423193b
--- /dev/null
+++ b/server/continuedev/libs/llm/replicate.py
@@ -0,0 +1,78 @@
+import concurrent.futures
+from typing import List
+
+import replicate
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM
+from .prompts.edit import simplified_edit_prompt
+
+
+class ReplicateLLM(LLM):
+    """
+    Replicate is a great option for newly released language models or models that you've deployed through their platform. Sign up for an account [here](https://replicate.ai/), copy your API key, and then select any model from the [Replicate Streaming List](https://replicate.com/collections/streaming-language-models). Change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.core.models import Models
+    from continuedev.libs.llm.replicate import ReplicateLLM
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=ReplicateLLM(
+                model="replicate/codellama-13b-instruct:da5676342de1a5a335b848383af297f592b816b950a43d251a0a9edd0113604b",
+                api_key="my-replicate-api-key")
+        )
+    )
+    ```
+
+    If you don't specify the `model` parameter, it will default to `replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781`.
+    """
+
+    api_key: str = Field(..., description="Replicate API key")
+
+    model: str = "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781"
+
+    _client: replicate.Client = None
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    async def start(self, **kwargs):
+        """Start the base LLM and create the Replicate client from ``api_key``."""
+        await super().start(**kwargs)
+        self._client = replicate.Client(api_token=self.api_key)
+
+    def _run(self, text: str):
+        """Call the blocking Replicate client with ``text``.
+
+        The same text is sent under both the ``message`` and ``prompt`` keys.
+        """
+        return self._client.run(self.model, input={"message": text, "prompt": text})
+
+    async def _iterate_run(self, text: str):
+        """Yield the model's output items without blocking the event loop.
+
+        The client call and each ``next()`` on its output run in the default
+        executor, so other coroutines keep running while the model responds.
+        """
+        # Local import: asyncio is not among this module's top-level imports.
+        import asyncio
+
+        loop = asyncio.get_running_loop()
+        exhausted = object()  # sentinel so a falsy item never ends the stream
+        output = await loop.run_in_executor(None, lambda: iter(self._run(text)))
+        while True:
+            item = await loop.run_in_executor(None, next, output, exhausted)
+            if item is exhausted:
+                return
+            yield item
+
+    async def _complete(self, prompt: str, options):
+        """Return the full completion for ``prompt`` as one string."""
+        # Local import: asyncio is not among this module's top-level imports.
+        import asyncio
+
+        def helper():
+            return "".join(self._run(prompt))
+
+        # Awaiting the executor keeps the event loop free; blocking on
+        # Future.result() here would stall every other coroutine.
+        return await asyncio.get_running_loop().run_in_executor(None, helper)
+
+    async def _stream_complete(self, prompt, options):
+        """Yield completion pieces for ``prompt`` as the model produces them."""
+        async for item in self._iterate_run(prompt):
+            yield item
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        """Yield assistant message chunks for the last message in ``messages``.
+
+        Only the content of ``messages[-1]`` is sent to the model.
+        """
+        async for item in self._iterate_run(messages[-1]["content"]):
+            yield {"content": item, "role": "assistant"}
diff --git a/server/continuedev/libs/llm/text_gen_interface.py b/server/continuedev/libs/llm/text_gen_interface.py
new file mode 100644
index 00000000..225fd3b6
--- /dev/null
+++ b/server/continuedev/libs/llm/text_gen_interface.py
@@ -0,0 +1,114 @@
+import json
+from typing import Any, Callable, Dict, List, Union
+
+import websockets
+from pydantic import Field
+
+from ...core.main import ChatMessage
+from .base import LLM
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplest_edit_prompt
+
+
+class TextGenUI(LLM):
+    """
+    TextGenUI is a comprehensive, open-source language model UI and local server. You can set it up with an OpenAI-compatible server plugin, but if for some reason that doesn't work, you can use this class like so:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.libs.llm.text_gen_interface import TextGenUI
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=TextGenUI(
+                model="<MODEL_NAME>",
+            )
+        )
+    )
+    ```
+    """
+
+    model: str = "text-gen-ui"
+    server_url: str = Field(
+        "http://localhost:5000", description="URL of your TextGenUI server"
+    )
+    streaming_url: str = Field(
+        "http://localhost:5005",
+        description="URL of your TextGenUI streaming server (separate from main server URL)",
+    )
+
+    prompt_templates = {
+        "edit": simplest_edit_prompt,
+    }
+
+    template_messages: Union[
+        Callable[[List[Dict[str, str]]], str], None
+    ] = llama2_template_messages
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def collect_args(self, options) -> Any:
+        """Return the base arguments with ``max_tokens`` sent as ``max_new_tokens``."""
+        args = super().collect_args(options)
+        args = {**args, "max_new_tokens": options.max_tokens}
+        args.pop("max_tokens", None)
+        return args
+
+    def _websocket_base_url(self) -> str:
+        """Return ``streaming_url`` with its http(s) scheme swapped for ws(s)."""
+        return self.streaming_url.replace("http://", "ws://").replace(
+            "https://", "wss://"
+        )
+
+    async def _stream_complete(self, prompt, options):
+        """Yield text pieces from the ``/api/v1/stream`` websocket endpoint.
+
+        Stops when the server sends a ``stream_end`` event.
+        """
+        args = self.collect_args(options)
+
+        payload = json.dumps({"prompt": prompt, "stream": True, **args})
+        async with websockets.connect(
+            f"{self._websocket_base_url()}/api/v1/stream", ping_interval=None
+        ) as websocket:
+            await websocket.send(payload)
+
+            while True:
+                incoming_data = json.loads(await websocket.recv())
+                event = incoming_data["event"]
+
+                if event == "stream_end":
+                    break
+                if event == "text_stream":
+                    yield incoming_data["text"]
+
+    async def _stream_chat(self, messages: List[ChatMessage], options):
+        """Yield assistant message chunks from ``/api/v1/chat-stream``.
+
+        Each ``text_stream`` event carries the visible reply so far; only the
+        part added since the previous event is yielded. Stops when the server
+        sends a ``stream_end`` event.
+        """
+        args = self.collect_args(options)
+
+        history = [message["content"] for message in messages]
+        payload = json.dumps(
+            {
+                "user_input": messages[-1]["content"],
+                "history": {"internal": [history], "visible": [history]},
+                "stream": True,
+                **args,
+            }
+        )
+        async with websockets.connect(
+            f"{self._websocket_base_url()}/api/v1/chat-stream", ping_interval=None
+        ) as websocket:
+            await websocket.send(payload)
+
+            prev = ""
+            while True:
+                incoming_data = json.loads(await websocket.recv())
+                event = incoming_data["event"]
+
+                if event == "stream_end":
+                    break
+                if event != "text_stream":
+                    continue
+
+                visible = incoming_data["history"]["visible"][-1]
+                if len(visible) > 0:
+                    current = visible[-1]
+                    # Strip only the leading text that was already sent.
+                    # str.replace() would also delete later repeats of it and
+                    # corrupt the yielded delta.
+                    if current.startswith(prev):
+                        delta = current[len(prev) :]
+                    else:
+                        delta = current
+                    yield {"role": "assistant", "content": delta}
+                    prev = current
diff --git a/server/continuedev/libs/llm/together.py b/server/continuedev/libs/llm/together.py
new file mode 100644
index 00000000..35b3a424
--- /dev/null
+++ b/server/continuedev/libs/llm/together.py
@@ -0,0 +1,125 @@
+import json
+from typing import Callable
+
+import aiohttp
+from pydantic import Field
+
+from ...core.main import ContinueCustomException
+from ..util.logging import logger
+from .base import LLM
+from .prompts.chat import llama2_template_messages
+from .prompts.edit import simplified_edit_prompt
+
+
+class TogetherLLM(LLM):
+    """
+    The Together API is a cloud platform for running large AI models. You can sign up [here](https://api.together.xyz/signup), copy your API key on the initial welcome screen, and then hit the play button on any model from the [Together Models list](https://docs.together.ai/docs/models-inference). Change `~/.continue/config.py` to look like this:
+
+    ```python title="~/.continue/config.py"
+    from continuedev.core.models import Models
+    from continuedev.libs.llm.together import TogetherLLM
+
+    config = ContinueConfig(
+        ...
+        models=Models(
+            default=TogetherLLM(
+                api_key="<API_KEY>",
+                model="togethercomputer/llama-2-13b-chat"
+            )
+        )
+    )
+    ```
+    """
+
+    api_key: str = Field(..., description="Together API key")
+
+    model: str = "togethercomputer/RedPajama-INCITE-7B-Instruct"
+    base_url: str = Field(
+        "https://api.together.xyz",
+        description="The base URL for your Together API instance",
+    )
+
+    _client_session: aiohttp.ClientSession = None
+
+    template_messages: Callable = llama2_template_messages
+
+    prompt_templates = {
+        "edit": simplified_edit_prompt,
+    }
+
+    async def start(self, **kwargs):
+        """Start the base LLM and open the HTTP session used for requests."""
+        await super().start(**kwargs)
+        self._client_session = aiohttp.ClientSession(
+            connector=aiohttp.TCPConnector(verify_ssl=self.verify_ssl),
+            timeout=aiohttp.ClientTimeout(total=self.timeout),
+        )
+
+    async def stop(self):
+        """Close the HTTP session opened by ``start``."""
+        await self._client_session.close()
+
+    @staticmethod
+    def _parse_stream_line(raw_line: bytes):
+        """Interpret one line of the streamed response.
+
+        Returns:
+            A ``(done, text)`` tuple. ``done`` is True for the ``[DONE]``
+            marker. ``text`` is the generated text carried by the line, or
+            None when the line carries none (blank, ping, invalid JSON, or
+            JSON without ``choices``).
+        """
+        line = raw_line.decode("utf-8").strip()
+        if line == "" or line.startswith(": ping - "):
+            return False, None
+        if line.startswith("data: "):
+            line = line[6:]
+        if line == "[DONE]":
+            return True, None
+        try:
+            json_chunk = json.loads(line)
+        except Exception as e:
+            logger.warning(f"Invalid JSON chunk: {line}\n\n{e}")
+            return False, None
+        if "choices" in json_chunk:
+            return False, json_chunk["choices"][0]["text"]
+        return False, None
+
+    async def _stream_complete(self, prompt, options):
+        """Yield generated text pieces from a streaming ``/inference`` request.
+
+        Stops at the ``[DONE]`` marker or when the response body ends.
+        """
+        args = self.collect_args(options)
+
+        async with self._client_session.post(
+            f"{self.base_url}/inference",
+            json={
+                "prompt": prompt,
+                "stream_tokens": True,
+                **args,
+            },
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            proxy=self.proxy,
+        ) as resp:
+            # Buffer by line: a network read may end in the middle of a line,
+            # and the end-of-HTTP-chunk flag is False for such partial reads,
+            # so filtering on it would silently drop data.
+            buffer = b""
+            async for data, _end_of_http_chunk in resp.content.iter_chunks():
+                buffer += data
+                *lines, buffer = buffer.split(b"\n")
+                for raw_line in lines:
+                    done, text = self._parse_stream_line(raw_line)
+                    if done:
+                        return
+                    if text is not None:
+                        yield text
+
+            # Handle a final line that was not terminated by a newline.
+            done, text = self._parse_stream_line(buffer)
+            if not done and text is not None:
+                yield text
+
+    async def _complete(self, prompt: str, options):
+        """Return the generated text for ``prompt`` from ``/inference``.
+
+        Raises:
+            ContinueCustomException: when the response carries an ``error``
+                and no usable output.
+            Exception: with the raw response body when it carries neither
+                usable output nor an ``error``.
+        """
+        args = self.collect_args(options)
+
+        async with self._client_session.post(
+            f"{self.base_url}/inference",
+            json={"prompt": prompt, **args},
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            proxy=self.proxy,
+        ) as resp:
+            text = await resp.text()
+            j = json.loads(text)
+
+            # Check that "output" exists before indexing into it.
+            output = j.get("output") if isinstance(j, dict) else None
+            if isinstance(output, dict) and "choices" in output:
+                return output["choices"][0]["text"]
+
+            error = j.get("error") if isinstance(j, dict) else None
+            if error is not None:
+                # Normalize to text so a structured error object is still
+                # reported instead of failing on .startswith().
+                if not isinstance(error, str):
+                    error = json.dumps(error)
+                if error.startswith("invalid hexlify value"):
+                    raise ContinueCustomException(
+                        message=f"Invalid Together API key:\n\n{error}",
+                        title="Together API Error",
+                    )
+                raise ContinueCustomException(
+                    message=error, title="Together API Error"
+                )
+
+            raise Exception(text)
author	Nate Sesti <33237525+sestinj@users.noreply.github.com>	2023-10-09 18:37:27 -0700
committer	GitHub <noreply@github.com>	2023-10-09 18:37:27 -0700
commit	f09150617ed2454f3074bcf93f53aae5ae637d40 (patch)
tree	5cfe614a64d921dfe58b049f426d67a8b832c71f /server/continuedev/libs/llm
parent	985304a213f620cdff3f8f65f74ed7e3b79be29d (diff)
download	sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.tar.gz sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.tar.bz2 sncontinue-f09150617ed2454f3074bcf93f53aae5ae637d40.zip