21 files changed, 41 insertions(+), 1346 deletions(-)
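This commit drops the hosted LLM providers (OpenAI, OpenAIFreeTrial, Anthropic, GGML, Replicate, Together, Hugging Face, Google PaLM, text-gen-ui) and makes a locally served Ollama model the default for both the `default` and `summarize` roles. As a reference, here is a minimal sketch of what `~/.continue/config.py` looks like after this change, assembled from the config hunks below; it assumes an Ollama server running on its default local port, and it omits the context providers and slash commands that the full default config also sets up.

```python
# Minimal sketch of ~/.continue/config.py after this change, assembled from the
# config.py / default_config.py hunks in the diff below. Assumes an Ollama
# server running locally on its default port; context providers, slash
# commands, and other fields of the full default config are omitted here.
from continuedev.core.config import ContinueConfig
from continuedev.core.models import Models
from continuedev.libs.llm import Ollama

config = ContinueConfig(
    allow_anonymous_telemetry=True,
    models=Models(
        default=Ollama(
            title="CodeLlama-7b-Instruct",
            model="codellama:7b-instruct",
        ),
        summarize=Ollama(
            title="CodeLlama-7b-Instruct",
            model="codellama:7b-instruct",
        ),
    ),
    system_message=None,
    temperature=0.5,
)
```

Both roles point at the same `codellama:7b-instruct` model, so a single pulled model covers chat and summarization.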
diff --git a/server/config.py b/server/config.py index 29c05a6c..0ac22cb4 100644 --- a/server/config.py +++ b/server/config.py @@ -6,7 +6,7 @@ See https://continue.dev/docs/customization to for documentation of the availabl from continuedev.core.models import Models from continuedev.core.config import CustomCommand, SlashCommand, ContinueConfig -from continuedev.libs.llm import OpenAIFreeTrial +from continuedev.libs.llm import Ollama from continuedev.plugins.context_providers import ( DiffContextProvider, @@ -26,8 +26,14 @@ from continuedev.plugins.steps.share_session import ShareSessionStep config = ContinueConfig( allow_anonymous_telemetry=True, models=Models( - default=OpenAIFreeTrial(api_key="", model="gpt-4"), - summarize=OpenAIFreeTrial(api_key="", model="gpt-3.5-turbo"), + default=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ), + summarize=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ) ), system_message=None, temperature=0.5, diff --git a/server/continuedev/core/config.py b/server/continuedev/core/config.py index 2bbb42cc..bf555b59 100644 --- a/server/continuedev/core/config.py +++ b/server/continuedev/core/config.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional, Type from pydantic import BaseModel, Field, validator -from ..libs.llm.openai_free_trial import OpenAIFreeTrial +from ..libs.llm import Ollama from .context import ContextProvider from .main import Policy, Step from .models import Models @@ -48,8 +48,14 @@ class ContinueConfig(BaseModel): ) models: Models = Field( Models( - default=OpenAIFreeTrial(model="gpt-4"), - summarize=OpenAIFreeTrial(model="gpt-3.5-turbo"), + default=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ), + summarize=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ) ), description="Configuration for the models used by Continue. 
Read more about how to configure models in the documentation.", ) diff --git a/server/continuedev/core/models.py b/server/continuedev/core/models.py index 21ebd8f6..c31177b9 100644 --- a/server/continuedev/core/models.py +++ b/server/continuedev/core/models.py @@ -2,18 +2,9 @@ from typing import List, Optional from pydantic import BaseModel -from ..libs.llm.anthropic import AnthropicLLM from ..libs.llm.base import LLM -from ..libs.llm.ggml import GGML -from ..libs.llm.google_palm_api import GooglePaLMAPI -from ..libs.llm.hf_inference_api import HuggingFaceInferenceAPI -from ..libs.llm.hf_tgi import HuggingFaceTGI from ..libs.llm.llamacpp import LlamaCpp from ..libs.llm.ollama import Ollama -from ..libs.llm.openai import OpenAI -from ..libs.llm.openai_free_trial import OpenAIFreeTrial -from ..libs.llm.replicate import ReplicateLLM -from ..libs.llm.together import TogetherLLM class ContinueSDK(BaseModel): @@ -30,32 +21,14 @@ ALL_MODEL_ROLES = [ MODEL_CLASSES = { cls.__name__: cls for cls in [ - OpenAI, - OpenAIFreeTrial, - GGML, - TogetherLLM, - AnthropicLLM, - ReplicateLLM, Ollama, - LlamaCpp, - HuggingFaceInferenceAPI, - HuggingFaceTGI, - GooglePaLMAPI, + LlamaCpp ] } MODEL_MODULE_NAMES = { - "OpenAI": "openai", - "OpenAIFreeTrial": "openai_free_trial", - "GGML": "ggml", - "TogetherLLM": "together", - "AnthropicLLM": "anthropic", - "ReplicateLLM": "replicate", "Ollama": "ollama", - "LlamaCpp": "llamacpp", - "HuggingFaceInferenceAPI": "hf_inference_api", - "HuggingFaceTGI": "hf_tgi", - "GooglePaLMAPI": "google_palm_api", + "LlamaCpp": "llamacpp" } diff --git a/server/continuedev/core/steps.py b/server/continuedev/core/steps.py index 5c20dd15..110a4457 100644 --- a/server/continuedev/core/steps.py +++ b/server/continuedev/core/steps.py @@ -5,7 +5,6 @@ from textwrap import dedent from typing import Coroutine, List, Optional, Union from ..libs.llm.base import LLM -from ..libs.llm.openai_free_trial import OpenAIFreeTrial from ..libs.util.count_tokens import DEFAULT_MAX_TOKENS from ..libs.util.devdata import dev_data_logger from ..libs.util.strings import ( @@ -229,12 +228,6 @@ class DefaultModelEditCodeStep(Step): + max_tokens ) - # If using 3.5 and overflows, upgrade to 3.5.16k - if model_to_use.model == "gpt-3.5-turbo": - if total_tokens > model_to_use.context_length: - model_to_use = OpenAIFreeTrial(model="gpt-3.5-turbo-0613") - await sdk.start_model(model_to_use) - # Remove tokens from the end first, and then the start to clear space # This part finds the start and end lines full_file_contents_lst = full_file_contents.split("\n") diff --git a/server/continuedev/libs/constants/default_config.py b/server/continuedev/libs/constants/default_config.py index a007eef1..7cffacbc 100644 --- a/server/continuedev/libs/constants/default_config.py +++ b/server/continuedev/libs/constants/default_config.py @@ -7,7 +7,7 @@ See https://continue.dev/docs/customization to for documentation of the availabl from continuedev.core.models import Models from continuedev.core.config import CustomCommand, SlashCommand, ContinueConfig -from continuedev.libs.llm import OpenAIFreeTrial +from continuedev.libs.llm import Ollama from continuedev.plugins.context_providers import ( DiffContextProvider, @@ -27,8 +27,14 @@ from continuedev.plugins.steps.share_session import ShareSessionStep config = ContinueConfig( allow_anonymous_telemetry=True, models=Models( - default=OpenAIFreeTrial(api_key="", model="gpt-4"), - summarize=OpenAIFreeTrial(api_key="", model="gpt-3.5-turbo") + default=Ollama( + title="CodeLlama-7b-Instruct", 
+ model="codellama:7b-instruct" + ), + summarize=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ) ), system_message=None, temperature=0.5, diff --git a/server/continuedev/libs/llm/__init__.py b/server/continuedev/libs/llm/__init__.py index 829ffede..7ac92059 100644 --- a/server/continuedev/libs/llm/__init__.py +++ b/server/continuedev/libs/llm/__init__.py @@ -1,14 +1,4 @@ -from .anthropic import AnthropicLLM # noqa: F401 -from .ggml import GGML # noqa: F401 -from .google_palm_api import GooglePaLMAPI # noqa: F401 -from .hf_inference_api import HuggingFaceInferenceAPI # noqa: F401 -from .hf_tgi import HuggingFaceTGI # noqa: F401 from .llamacpp import LlamaCpp # noqa: F401 from .ollama import Ollama # noqa: F401 -from .openai import OpenAI # noqa: F401 -from .openai_free_trial import OpenAIFreeTrial # noqa: F401 from .proxy_server import ProxyServer # noqa: F401 from .queued import QueuedLLM # noqa: F401 -from .replicate import ReplicateLLM # noqa: F401 -from .text_gen_interface import TextGenUI # noqa: F401 -from .together import TogetherLLM # noqa: F401 diff --git a/server/continuedev/libs/llm/anthropic.py b/server/continuedev/libs/llm/anthropic.py deleted file mode 100644 index 7d0708f1..00000000 --- a/server/continuedev/libs/llm/anthropic.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Any, Callable, Coroutine - -from anthropic import AI_PROMPT, HUMAN_PROMPT, AsyncAnthropic - -from .base import LLM, CompletionOptions -from .prompts.chat import anthropic_template_messages - - -class AnthropicLLM(LLM): - """ - Import the `AnthropicLLM` class and set it as the default model: - - ```python title="~/.continue/config.py" - from continuedev.libs.llm.anthropic import AnthropicLLM - - config = ContinueConfig( - ... - models=Models( - default=AnthropicLLM(api_key="<API_KEY>", model="claude-2") - ) - ) - ``` - - Claude 2 is not yet publicly released. You can request early access [here](https://www.anthropic.com/earlyaccess). 
- - """ - - api_key: str - "Anthropic API key" - - model: str = "claude-2" - - _async_client: AsyncAnthropic = None - - template_messages: Callable = anthropic_template_messages - - class Config: - arbitrary_types_allowed = True - - async def start(self, **kwargs): - await super().start(**kwargs) - self._async_client = AsyncAnthropic(api_key=self.api_key) - - if self.model == "claude-2": - self.context_length = 100_000 - - def collect_args(self, options: CompletionOptions): - options.stop = None - args = super().collect_args(options) - - if "max_tokens" in args: - args["max_tokens_to_sample"] = args["max_tokens"] - del args["max_tokens"] - if "frequency_penalty" in args: - del args["frequency_penalty"] - if "presence_penalty" in args: - del args["presence_penalty"] - return args - - async def _stream_complete(self, prompt: str, options): - args = self.collect_args(options) - prompt = f"{HUMAN_PROMPT} {prompt} {AI_PROMPT}" - - async for chunk in await self._async_client.completions.create( - prompt=prompt, stream=True, **args - ): - yield chunk.completion - - async def _complete(self, prompt: str, options) -> Coroutine[Any, Any, str]: - args = self.collect_args(options) - prompt = f"{HUMAN_PROMPT} {prompt} {AI_PROMPT}" - return ( - await self._async_client.completions.create(prompt=prompt, **args) - ).completion diff --git a/server/continuedev/libs/llm/ggml.py b/server/continuedev/libs/llm/ggml.py deleted file mode 100644 index 55d580a8..00000000 --- a/server/continuedev/libs/llm/ggml.py +++ /dev/null @@ -1,226 +0,0 @@ -import json -from typing import Any, Callable, Coroutine, Dict, List, Literal, Optional - -from pydantic import Field - -from ...core.main import ChatMessage -from ..util.logging import logger -from .base import LLM, CompletionOptions -from .openai import CHAT_MODELS -from .prompts.chat import llama2_template_messages -from .prompts.edit import simplified_edit_prompt - - -class GGML(LLM): - """ - See our [5 minute quickstart](https://github.com/continuedev/ggml-server-example) to run any model locally with ggml. While these models don't yet perform as well, they are free, entirely private, and run offline. - - Once the model is running on localhost:8000, change `~/.continue/config.py` to look like this: - - ```python title="~/.continue/config.py" - from continuedev.libs.llm.ggml import GGML - - config = ContinueConfig( - ... - models=Models( - default=GGML( - max_context_length=2048, - server_url="http://localhost:8000") - ) - ) - ``` - """ - - server_url: str = Field( - "http://localhost:8000", - description="URL of the OpenAI-compatible server where the model is being served", - ) - model: str = Field( - "ggml", description="The name of the model to use (optional for the GGML class)" - ) - - api_base: Optional[str] = Field(None, description="OpenAI API base URL.") - - api_type: Optional[Literal["azure", "openai"]] = Field( - None, description="OpenAI API type." - ) - - api_version: Optional[str] = Field( - None, description="OpenAI API version. For use with Azure OpenAI Service." - ) - - engine: Optional[str] = Field( - None, description="OpenAI engine. For use with Azure OpenAI Service." 
- ) - - template_messages: Optional[ - Callable[[List[Dict[str, str]]], str] - ] = llama2_template_messages - - prompt_templates = { - "edit": simplified_edit_prompt, - } - - class Config: - arbitrary_types_allowed = True - - def get_headers(self): - headers = { - "Content-Type": "application/json", - } - if self.api_key is not None: - if self.api_type == "azure": - headers["api-key"] = self.api_key - else: - headers["Authorization"] = f"Bearer {self.api_key}" - - return headers - - def get_full_server_url(self, endpoint: str): - endpoint = endpoint.lstrip("/").rstrip("/") - - if self.api_type == "azure": - if self.engine is None or self.api_version is None or self.api_base is None: - raise Exception( - "For Azure OpenAI Service, you must specify engine, api_version, and api_base." - ) - - return f"{self.api_base}/openai/deployments/{self.engine}/{endpoint}?api-version={self.api_version}" - else: - return f"{self.server_url}/v1/{endpoint}" - - async def _raw_stream_complete(self, prompt, options): - args = self.collect_args(options) - - async with self.create_client_session() as client_session: - async with client_session.post( - self.get_full_server_url(endpoint="completions"), - json={ - "prompt": prompt, - "stream": True, - **args, - }, - headers=self.get_headers(), - proxy=self.proxy, - ) as resp: - if resp.status != 200: - raise Exception( - f"Error calling /chat/completions endpoint: {resp.status}" - ) - - async for line in resp.content.iter_any(): - if line: - chunks = line.decode("utf-8") - for chunk in chunks.split("\n"): - if ( - chunk.startswith(": ping - ") - or chunk.startswith("data: [DONE]") - or chunk.strip() == "" - ): - continue - elif chunk.startswith("data: "): - chunk = chunk[6:] - try: - j = json.loads(chunk) - except Exception: - continue - if ( - "choices" in j - and len(j["choices"]) > 0 - and "text" in j["choices"][0] - ): - yield j["choices"][0]["text"] - - async def _stream_chat(self, messages: List[ChatMessage], options): - args = self.collect_args(options) - - async def generator(): - async with self.create_client_session() as client_session: - async with client_session.post( - self.get_full_server_url(endpoint="chat/completions"), - json={"messages": messages, "stream": True, **args}, - headers=self.get_headers(), - proxy=self.proxy, - ) as resp: - if resp.status != 200: - raise Exception( - f"Error calling /chat/completions endpoint: {resp.status}" - ) - - async for line, end in resp.content.iter_chunks(): - json_chunk = line.decode("utf-8") - chunks = json_chunk.split("\n") - for chunk in chunks: - if ( - chunk.strip() == "" - or json_chunk.startswith(": ping - ") - or json_chunk.startswith("data: [DONE]") - ): - continue - try: - yield json.loads(chunk[6:])["choices"][0]["delta"] - except: - pass - - # Because quite often the first attempt fails, and it works thereafter - try: - async for chunk in generator(): - yield chunk - except Exception as e: - logger.warning(f"Error calling /chat/completions endpoint: {e}") - async for chunk in generator(): - yield chunk - - async def _raw_complete(self, prompt: str, options) -> Coroutine[Any, Any, str]: - args = self.collect_args(options) - - async with self.create_client_session() as client_session: - async with client_session.post( - self.get_full_server_url(endpoint="completions"), - json={ - "prompt": prompt, - **args, - }, - headers=self.get_headers(), - proxy=self.proxy, - ) as resp: - if resp.status != 200: - raise Exception( - f"Error calling /chat/completions endpoint: {resp.status}" - ) - - text = await 
resp.text() - try: - completion = json.loads(text)["choices"][0]["text"] - return completion - except Exception as e: - raise Exception( - f"Error calling /completion endpoint: {e}\n\nResponse text: {text}" - ) - - async def _complete(self, prompt: str, options: CompletionOptions): - completion = "" - if self.model in CHAT_MODELS: - async for chunk in self._stream_chat( - [{"role": "user", "content": prompt}], options - ): - if "content" in chunk: - completion += chunk["content"] - - else: - async for chunk in self._raw_stream_complete(prompt, options): - completion += chunk - - return completion - - async def _stream_complete(self, prompt, options: CompletionOptions): - if self.model in CHAT_MODELS: - async for chunk in self._stream_chat( - [{"role": "user", "content": prompt}], options - ): - if "content" in chunk: - yield chunk["content"] - - else: - async for chunk in self._raw_stream_complete(prompt, options): - yield chunk diff --git a/server/continuedev/libs/llm/google_palm_api.py b/server/continuedev/libs/llm/google_palm_api.py deleted file mode 100644 index 3379fefe..00000000 --- a/server/continuedev/libs/llm/google_palm_api.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import List - -import requests -from pydantic import Field - -from ...core.main import ChatMessage -from .base import LLM - - -class GooglePaLMAPI(LLM): - """ - The Google PaLM API is currently in public preview, so production applications are not supported yet. However, you can [create an API key in Google MakerSuite](https://makersuite.google.com/u/2/app/apikey) and begin trying out the `chat-bison-001` model. Change `~/.continue/config.py` to look like this: - - ```python title="~/.continue/config.py" - from continuedev.core.models import Models - from continuedev.libs.llm.hf_inference_api import GooglePaLMAPI - - config = ContinueConfig( - ... - models=Models( - default=GooglePaLMAPI( - model="chat-bison-001" - api_key="<MAKERSUITE_API_KEY>", - ) - ) - ``` - """ - - api_key: str = Field(..., description="Google PaLM API key") - - model: str = "chat-bison-001" - - async def _stream_complete(self, prompt, options): - api_url = f"https://generativelanguage.googleapis.com/v1beta2/models/{self.model}:generateMessage?key={self.api_key}" - body = {"prompt": {"messages": [{"content": prompt}]}} - response = requests.post(api_url, json=body) - yield response.json()["candidates"][0]["content"] - - async def _stream_chat(self, messages: List[ChatMessage], options): - msg_lst = [] - for message in messages: - msg_lst.append({"content": message["content"]}) - - api_url = f"https://generativelanguage.googleapis.com/v1beta2/models/{self.model}:generateMessage?key={self.api_key}" - body = {"prompt": {"messages": msg_lst}} - response = requests.post(api_url, json=body) - yield { - "content": response.json()["candidates"][0]["content"], - "role": "assistant", - } diff --git a/server/continuedev/libs/llm/hf_inference_api.py b/server/continuedev/libs/llm/hf_inference_api.py deleted file mode 100644 index 990ec7c8..00000000 --- a/server/continuedev/libs/llm/hf_inference_api.py +++ /dev/null @@ -1,78 +0,0 @@ -from typing import Callable, Dict, List, Union - -from huggingface_hub import InferenceClient -from pydantic import Field - -from .base import LLM, CompletionOptions -from .prompts.chat import llama2_template_messages -from .prompts.edit import simplified_edit_prompt - - -class HuggingFaceInferenceAPI(LLM): - """ - Hugging Face Inference API is a great option for newly released language models. 
Sign up for an account and add billing [here](https://huggingface.co/settings/billing), access the Inference Endpoints [here](https://ui.endpoints.huggingface.co), click on “New endpoint”, and fill out the form (e.g. select a model like [WizardCoder-Python-34B-V1.0](https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0)), and then deploy your model by clicking “Create Endpoint”. Change `~/.continue/config.py` to look like this: - - ```python title="~/.continue/config.py" - from continuedev.core.models import Models - from continuedev.libs.llm.hf_inference_api import HuggingFaceInferenceAPI - - config = ContinueConfig( - ... - models=Models( - default=HuggingFaceInferenceAPI( - endpoint_url="<INFERENCE_API_ENDPOINT_URL>", - hf_token="<HUGGING_FACE_TOKEN>", - ) - ) - ``` - """ - - model: str = Field( - "Hugging Face Inference API", - description="The name of the model to use (optional for the HuggingFaceInferenceAPI class)", - ) - hf_token: str = Field(..., description="Your Hugging Face API token") - endpoint_url: str = Field( - None, description="Your Hugging Face Inference API endpoint URL" - ) - - template_messages: Union[ - Callable[[List[Dict[str, str]]], str], None - ] = llama2_template_messages - - prompt_templates = { - "edit": simplified_edit_prompt, - } - - class Config: - arbitrary_types_allowed = True - - def collect_args(self, options: CompletionOptions): - options.stop = None - args = super().collect_args(options) - - if "max_tokens" in args: - args["max_new_tokens"] = args["max_tokens"] - del args["max_tokens"] - if "stop" in args: - args["stop_sequences"] = args["stop"] - del args["stop"] - - return args - - async def _stream_complete(self, prompt, options): - args = self.collect_args(options) - - client = InferenceClient(self.endpoint_url, token=self.hf_token) - - stream = client.text_generation(prompt, stream=True, details=True, **args) - - for r in stream: - # skip special tokens - if r.token.special: - continue - # stop if we encounter a stop sequence - if options.stop is not None: - if r.token.text in options.stop: - break - yield r.token.text diff --git a/server/continuedev/libs/llm/hf_tgi.py b/server/continuedev/libs/llm/hf_tgi.py deleted file mode 100644 index 62458db4..00000000 --- a/server/continuedev/libs/llm/hf_tgi.py +++ /dev/null @@ -1,65 +0,0 @@ -import json -from typing import Any, Callable, List - -from pydantic import Field - -from ...core.main import ChatMessage -from .base import LLM, CompletionOptions -from .prompts.chat import llama2_template_messages -from .prompts.edit import simplified_edit_prompt - - -class HuggingFaceTGI(LLM): - model: str = "huggingface-tgi" - server_url: str = Field( - "http://localhost:8080", description="URL of your TGI server" - ) - - template_messages: Callable[[List[ChatMessage]], str] = llama2_template_messages - - prompt_templates = { - "edit": simplified_edit_prompt, - } - - class Config: - arbitrary_types_allowed = True - - def collect_args(self, options: CompletionOptions) -> Any: - args = super().collect_args(options) - args = {**args, "max_new_tokens": args.get("max_tokens", 1024), "best_of": 1} - args.pop("max_tokens", None) - args.pop("model", None) - args.pop("functions", None) - return args - - async def _stream_complete(self, prompt, options): - args = self.collect_args(options) - - async with self.create_client_session() as client_session: - async with client_session.post( - f"{self.server_url}/generate_stream", - json={"inputs": prompt, "parameters": args}, - headers={"Content-Type": "application/json"}, 
- proxy=self.proxy, - ) as resp: - async for line in resp.content.iter_any(): - if line: - text = line.decode("utf-8") - chunks = text.split("\n") - - for chunk in chunks: - if chunk.startswith("data: "): - chunk = chunk[len("data: ") :] - elif chunk.startswith("data:"): - chunk = chunk[len("data:") :] - - if chunk.strip() == "": - continue - - try: - json_chunk = json.loads(chunk) - except Exception as e: - print(f"Error parsing JSON: {e}") - continue - - yield json_chunk["token"]["text"] diff --git a/server/continuedev/libs/llm/hugging_face.py b/server/continuedev/libs/llm/hugging_face.py deleted file mode 100644 index c2e934c0..00000000 --- a/server/continuedev/libs/llm/hugging_face.py +++ /dev/null @@ -1,19 +0,0 @@ -# TODO: This class is far out of date - -from transformers import AutoModelForCausalLM, AutoTokenizer - -from .llm import LLM - - -class HuggingFace(LLM): - def __init__(self, model_path: str = "Salesforce/codegen-2B-mono"): - self.model_path = model_path - self.tokenizer = AutoTokenizer.from_pretrained(model_path) - self.model = AutoModelForCausalLM.from_pretrained(model_path) - - def complete(self, prompt: str, **kwargs): - args = {"max_tokens": 100} - args.update(kwargs) - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids - generated_ids = self.model.generate(input_ids, max_length=args["max_tokens"]) - return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True) diff --git a/server/continuedev/libs/llm/openai.py b/server/continuedev/libs/llm/openai.py deleted file mode 100644 index ba29279b..00000000 --- a/server/continuedev/libs/llm/openai.py +++ /dev/null @@ -1,156 +0,0 @@ -from typing import Callable, List, Literal, Optional - -import certifi -import openai -from pydantic import Field - -from ...core.main import ChatMessage -from .base import LLM - -CHAT_MODELS = { - "gpt-3.5-turbo", - "gpt-3.5-turbo-16k", - "gpt-4", - "gpt-3.5-turbo-0613", - "gpt-4-32k", -} -MAX_TOKENS_FOR_MODEL = { - "gpt-3.5-turbo": 4096, - "gpt-3.5-turbo-0613": 4096, - "gpt-3.5-turbo-16k": 16_384, - "gpt-4": 8192, - "gpt-35-turbo-16k": 16_384, - "gpt-35-turbo-0613": 4096, - "gpt-35-turbo": 4096, - "gpt-4-32k": 32_768, -} - - -class OpenAI(LLM): - """ - The OpenAI class can be used to access OpenAI models like gpt-4 and gpt-3.5-turbo. - - If you are locally serving a model that uses an OpenAI-compatible server, you can simply change the `api_base` in the `OpenAI` class like this: - - ```python title="~/.continue/config.py" - from continuedev.libs.llm.openai import OpenAI - - config = ContinueConfig( - ... - models=Models( - default=OpenAI( - api_key="EMPTY", - model="<MODEL_NAME>", - api_base="http://localhost:8000", # change to your server - ) - ) - ) - ``` - - Options for serving models locally with an OpenAI-compatible server include: - - - [text-gen-webui](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai#setup--installation) - - [FastChat](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md) - - [LocalAI](https://localai.io/basics/getting_started/) - - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python#web-server) - """ - - api_key: str = Field( - ..., - description="OpenAI API key", - ) - - proxy: Optional[str] = Field(None, description="Proxy URL to use for requests.") - - api_base: Optional[str] = Field(None, description="OpenAI API base URL.") - - api_type: Optional[Literal["azure", "openai"]] = Field( - None, description="OpenAI API type." 
- ) - - api_version: Optional[str] = Field( - None, description="OpenAI API version. For use with Azure OpenAI Service." - ) - - engine: Optional[str] = Field( - None, description="OpenAI engine. For use with Azure OpenAI Service." - ) - - async def start( - self, unique_id: Optional[str] = None, write_log: Callable[[str], None] = None - ): - await super().start(write_log=write_log, unique_id=unique_id) - - if self.context_length is None: - self.context_length = MAX_TOKENS_FOR_MODEL.get(self.model, 4096) - - openai.api_key = self.api_key - if self.api_type is not None: - openai.api_type = self.api_type - if self.api_base is not None: - openai.api_base = self.api_base - if self.api_version is not None: - openai.api_version = self.api_version - - if self.verify_ssl is not None and self.verify_ssl is False: - openai.verify_ssl_certs = False - - if self.proxy is not None: - openai.proxy = self.proxy - - openai.ca_bundle_path = self.ca_bundle_path or certifi.where() - - def collect_args(self, options): - args = super().collect_args(options) - if self.engine is not None: - args["engine"] = self.engine - - if not args["model"].endswith("0613") and "functions" in args: - del args["functions"] - - return args - - async def _stream_complete(self, prompt, options): - args = self.collect_args(options) - args["stream"] = True - - if args["model"] in CHAT_MODELS: - async for chunk in await openai.ChatCompletion.acreate( - messages=[{"role": "user", "content": prompt}], - **args, - headers=self.headers, - ): - if len(chunk.choices) > 0 and "content" in chunk.choices[0].delta: - yield chunk.choices[0].delta.content - else: - async for chunk in await openai.Completion.acreate(prompt=prompt, **args, headers=self.headers): - if len(chunk.choices) > 0: - yield chunk.choices[0].text - - async def _stream_chat(self, messages: List[ChatMessage], options): - args = self.collect_args(options) - - async for chunk in await openai.ChatCompletion.acreate( - messages=messages, - stream=True, - **args, - headers=self.headers, - ): - if not hasattr(chunk, "choices") or len(chunk.choices) == 0: - continue - yield chunk.choices[0].delta - - async def _complete(self, prompt: str, options): - args = self.collect_args(options) - - if args["model"] in CHAT_MODELS: - resp = await openai.ChatCompletion.acreate( - messages=[{"role": "user", "content": prompt}], - **args, - headers=self.headers, - ) - return resp.choices[0].message.content - else: - return ( - (await openai.Completion.acreate(prompt=prompt, **args, headers=self.headers)).choices[0].text - ) diff --git a/server/continuedev/libs/llm/openai_free_trial.py b/server/continuedev/libs/llm/openai_free_trial.py deleted file mode 100644 index b6e707f9..00000000 --- a/server/continuedev/libs/llm/openai_free_trial.py +++ /dev/null @@ -1,83 +0,0 @@ -from typing import Callable, List, Optional - -from ...core.main import ChatMessage -from .base import LLM -from .openai import OpenAI -from .proxy_server import ProxyServer - - -class OpenAIFreeTrial(LLM): - """ - With the `OpenAIFreeTrial` `LLM`, new users can try out Continue with GPT-4 using a proxy server that securely makes calls to OpenAI using our API key. Continue should just work the first time you install the extension in VS Code. - - Once you are using Continue regularly though, you will need to add an OpenAI API key that has access to GPT-4 by following these steps: - - 1. Copy your API key from https://platform.openai.com/account/api-keys - 2. Open `~/.continue/config.py`. 
You can do this by using the '/config' command in Continue - 3. Change the default LLMs to look like this: - - ```python title="~/.continue/config.py" - API_KEY = "<API_KEY>" - config = ContinueConfig( - ... - models=Models( - default=OpenAIFreeTrial(model="gpt-4", api_key=API_KEY), - summarize=OpenAIFreeTrial(model="gpt-3.5-turbo", api_key=API_KEY) - ) - ) - ``` - - The `OpenAIFreeTrial` class will automatically switch to using your API key instead of ours. If you'd like to explicitly use one or the other, you can use the `ProxyServer` or `OpenAI` classes instead. - - These classes support any models available through the OpenAI API, assuming your API key has access, including "gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", and "gpt-4-32k". - """ - - api_key: Optional[str] = None - - llm: Optional[LLM] = None - - def update_llm_properties(self): - if self.llm is not None: - self.llm.system_message = self.system_message - - async def start( - self, write_log: Callable[[str], None] = None, unique_id: Optional[str] = None - ): - await super().start(write_log=write_log, unique_id=unique_id) - if self.api_key is None or self.api_key.strip() == "": - self.llm = ProxyServer( - model=self.model, - verify_ssl=self.verify_ssl, - ca_bundle_path=self.ca_bundle_path, - ) - else: - self.llm = OpenAI( - api_key=self.api_key, - model=self.model, - verify_ssl=self.verify_ssl, - ca_bundle_path=self.ca_bundle_path, - ) - - await self.llm.start(write_log=write_log, unique_id=unique_id) - - async def stop(self): - await self.llm.stop() - - async def _complete(self, prompt: str, options): - self.update_llm_properties() - return await self.llm._complete(prompt, options) - - async def _stream_complete(self, prompt, options): - self.update_llm_properties() - resp = self.llm._stream_complete(prompt, options) - async for item in resp: - yield item - - async def _stream_chat(self, messages: List[ChatMessage], options): - self.update_llm_properties() - resp = self.llm._stream_chat(messages=messages, options=options) - async for item in resp: - yield item - - def count_tokens(self, text: str): - return self.llm.count_tokens(text) diff --git a/server/continuedev/libs/llm/replicate.py b/server/continuedev/libs/llm/replicate.py deleted file mode 100644 index 3423193b..00000000 --- a/server/continuedev/libs/llm/replicate.py +++ /dev/null @@ -1,78 +0,0 @@ -import concurrent.futures -from typing import List - -import replicate -from pydantic import Field - -from ...core.main import ChatMessage -from .base import LLM -from .prompts.edit import simplified_edit_prompt - - -class ReplicateLLM(LLM): - """ - Replicate is a great option for newly released language models or models that you've deployed through their platform. Sign up for an account [here](https://replicate.ai/), copy your API key, and then select any model from the [Replicate Streaming List](https://replicate.com/collections/streaming-language-models). Change `~/.continue/config.py` to look like this: - - ```python title="~/.continue/config.py" - from continuedev.core.models import Models - from continuedev.libs.llm.replicate import ReplicateLLM - - config = ContinueConfig( - ... - models=Models( - default=ReplicateLLM( - model="replicate/codellama-13b-instruct:da5676342de1a5a335b848383af297f592b816b950a43d251a0a9edd0113604b", - api_key="my-replicate-api-key") - ) - ) - ``` - - If you don't specify the `model` parameter, it will default to `replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781`. 
- """ - - api_key: str = Field(..., description="Replicate API key") - - model: str = "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781" - - _client: replicate.Client = None - - prompt_templates = { - "edit": simplified_edit_prompt, - } - - async def start(self, **kwargs): - await super().start(**kwargs) - self._client = replicate.Client(api_token=self.api_key) - - async def _complete(self, prompt: str, options): - def helper(): - output = self._client.run( - self.model, input={"message": prompt, "prompt": prompt} - ) - completion = "" - for item in output: - completion += item - - return completion - - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit(helper) - completion = future.result() - - return completion - - async def _stream_complete(self, prompt, options): - for item in self._client.run( - self.model, input={"message": prompt, "prompt": prompt} - ): - yield item - - async def _stream_chat(self, messages: List[ChatMessage], options): - for item in self._client.run( - self.model, - input={ - "message": messages[-1]["content"], - "prompt": messages[-1]["content"], - }, - ): - yield {"content": item, "role": "assistant"} diff --git a/server/continuedev/libs/llm/text_gen_interface.py b/server/continuedev/libs/llm/text_gen_interface.py deleted file mode 100644 index 225fd3b6..00000000 --- a/server/continuedev/libs/llm/text_gen_interface.py +++ /dev/null @@ -1,114 +0,0 @@ -import json -from typing import Any, Callable, Dict, List, Union - -import websockets -from pydantic import Field - -from ...core.main import ChatMessage -from .base import LLM -from .prompts.chat import llama2_template_messages -from .prompts.edit import simplest_edit_prompt - - -class TextGenUI(LLM): - """ - TextGenUI is a comprehensive, open-source language model UI and local server. You can set it up with an OpenAI-compatible server plugin, but if for some reason that doesn't work, you can use this class like so: - - ```python title="~/.continue/config.py" - from continuedev.libs.llm.text_gen_interface import TextGenUI - - config = ContinueConfig( - ... 
- models=Models( - default=TextGenUI( - model="<MODEL_NAME>", - ) - ) - ) - ``` - """ - - model: str = "text-gen-ui" - server_url: str = Field( - "http://localhost:5000", description="URL of your TextGenUI server" - ) - streaming_url: str = Field( - "http://localhost:5005", - description="URL of your TextGenUI streaming server (separate from main server URL)", - ) - - prompt_templates = { - "edit": simplest_edit_prompt, - } - - template_messages: Union[ - Callable[[List[Dict[str, str]]], str], None - ] = llama2_template_messages - - class Config: - arbitrary_types_allowed = True - - def collect_args(self, options) -> Any: - args = super().collect_args(options) - args = {**args, "max_new_tokens": options.max_tokens} - args.pop("max_tokens", None) - return args - - async def _stream_complete(self, prompt, options): - args = self.collect_args(options) - - ws_url = f"{self.streaming_url.replace('http://', 'ws://').replace('https://', 'wss://')}" - payload = json.dumps({"prompt": prompt, "stream": True, **args}) - async with websockets.connect( - f"{ws_url}/api/v1/stream", ping_interval=None - ) as websocket: - await websocket.send(payload) - - while True: - incoming_data = await websocket.recv() - incoming_data = json.loads(incoming_data) - - match incoming_data["event"]: - case "text_stream": - yield incoming_data["text"] - case "stream_end": - break - - async def _stream_chat(self, messages: List[ChatMessage], options): - args = self.collect_args(options) - - async def generator(): - ws_url = f"{self.streaming_url.replace('http://', 'ws://').replace('https://', 'wss://')}" - history = list(map(lambda x: x["content"], messages)) - payload = json.dumps( - { - "user_input": messages[-1]["content"], - "history": {"internal": [history], "visible": [history]}, - "stream": True, - **args, - } - ) - async with websockets.connect( - f"{ws_url}/api/v1/chat-stream", ping_interval=None - ) as websocket: - await websocket.send(payload) - - prev = "" - while True: - incoming_data = await websocket.recv() - incoming_data = json.loads(incoming_data) - - match incoming_data["event"]: - case "text_stream": - visible = incoming_data["history"]["visible"][-1] - if len(visible) > 0: - yield { - "role": "assistant", - "content": visible[-1].replace(prev, ""), - } - prev = visible[-1] - case "stream_end": - break - - async for chunk in generator(): - yield chunk diff --git a/server/continuedev/libs/llm/together.py b/server/continuedev/libs/llm/together.py deleted file mode 100644 index 35b3a424..00000000 --- a/server/continuedev/libs/llm/together.py +++ /dev/null @@ -1,125 +0,0 @@ -import json -from typing import Callable - -import aiohttp -from pydantic import Field - -from ...core.main import ContinueCustomException -from ..util.logging import logger -from .base import LLM -from .prompts.chat import llama2_template_messages -from .prompts.edit import simplified_edit_prompt - - -class TogetherLLM(LLM): - """ - The Together API is a cloud platform for running large AI models. You can sign up [here](https://api.together.xyz/signup), copy your API key on the initial welcome screen, and then hit the play button on any model from the [Together Models list](https://docs.together.ai/docs/models-inference). Change `~/.continue/config.py` to look like this: - - ```python title="~/.continue/config.py" - from continuedev.core.models import Models - from continuedev.libs.llm.together import TogetherLLM - - config = ContinueConfig( - ... 
- models=Models( - default=TogetherLLM( - api_key="<API_KEY>", - model="togethercomputer/llama-2-13b-chat" - ) - ) - ) - ``` - """ - - api_key: str = Field(..., description="Together API key") - - model: str = "togethercomputer/RedPajama-INCITE-7B-Instruct" - base_url: str = Field( - "https://api.together.xyz", - description="The base URL for your Together API instance", - ) - - _client_session: aiohttp.ClientSession = None - - template_messages: Callable = llama2_template_messages - - prompt_templates = { - "edit": simplified_edit_prompt, - } - - async def start(self, **kwargs): - await super().start(**kwargs) - self._client_session = aiohttp.ClientSession( - connector=aiohttp.TCPConnector(verify_ssl=self.verify_ssl), - timeout=aiohttp.ClientTimeout(total=self.timeout), - ) - - async def stop(self): - await self._client_session.close() - - async def _stream_complete(self, prompt, options): - args = self.collect_args(options) - - async with self._client_session.post( - f"{self.base_url}/inference", - json={ - "prompt": prompt, - "stream_tokens": True, - **args, - }, - headers={"Authorization": f"Bearer {self.api_key}"}, - proxy=self.proxy, - ) as resp: - async for line in resp.content.iter_chunks(): - if line[1]: - json_chunk = line[0].decode("utf-8") - if json_chunk.startswith(": ping - ") or json_chunk.startswith( - "data: [DONE]" - ): - continue - - chunks = json_chunk.split("\n") - for chunk in chunks: - if chunk.strip() != "": - if chunk.startswith("data: "): - chunk = chunk[6:] - if chunk == "[DONE]": - break - try: - json_chunk = json.loads(chunk) - except Exception as e: - logger.warning(f"Invalid JSON chunk: {chunk}\n\n{e}") - continue - if "choices" in json_chunk: - yield json_chunk["choices"][0]["text"] - - async def _complete(self, prompt: str, options): - args = self.collect_args(options) - - async with self._client_session.post( - f"{self.base_url}/inference", - json={"prompt": prompt, **args}, - headers={"Authorization": f"Bearer {self.api_key}"}, - proxy=self.proxy, - ) as resp: - text = await resp.text() - j = json.loads(text) - try: - if "choices" not in j["output"]: - raise Exception(text) - if "output" in j: - return j["output"]["choices"][0]["text"] - except Exception as e: - j = await resp.json() - if "error" in j: - if j["error"].startswith("invalid hexlify value"): - raise ContinueCustomException( - message=f"Invalid Together API key:\n\n{j['error']}", - title="Together API Error", - ) - else: - raise ContinueCustomException( - message=j["error"], title="Together API Error" - ) - - raise e diff --git a/server/continuedev/models/reference/generate.py b/server/continuedev/models/reference/generate.py index b17df3b2..43e88750 100644 --- a/server/continuedev/models/reference/generate.py +++ b/server/continuedev/models/reference/generate.py @@ -4,19 +4,9 @@ import json from textwrap import dedent LLM_MODULES = [ - ("openai", "OpenAI"), - ("anthropic", "AnthropicLLM"), - ("ggml", "GGML"), ("llamacpp", "LlamaCpp"), - ("text_gen_interface", "TextGenUI"), ("ollama", "Ollama"), - ("replicate", "ReplicateLLM"), - ("together", "TogetherLLM"), - ("hf_inference_api", "HuggingFaceInferenceAPI"), - ("hf_tgi", "HuggingFaceTGI"), - ("openai_free_trial", "OpenAIFreeTrial"), - ("google_palm_api", "GooglePaLMAPI"), - ("queued", "QueuedLLM"), + ("queued", "QueuedLLM") ] CONTEXT_PROVIDER_MODULES = [ diff --git a/server/continuedev/plugins/steps/chat.py b/server/continuedev/plugins/steps/chat.py index 1b0f76f9..919d939e 100644 --- a/server/continuedev/plugins/steps/chat.py +++ 
b/server/continuedev/plugins/steps/chat.py @@ -4,26 +4,17 @@ import os from textwrap import dedent from typing import Any, Coroutine, List -import openai from directory_tree import display_tree from dotenv import load_dotenv from pydantic import Field -from ...core.main import ChatMessage, FunctionCall, Models, Step, step_to_json_schema +from ...core.main import ChatMessage, Models, Step, step_to_json_schema from ...core.sdk import ContinueSDK -from ...core.steps import MessageStep -from ...libs.llm.openai import OpenAI -from ...libs.llm.openai_free_trial import OpenAIFreeTrial from ...libs.util.devdata import dev_data_logger from ...libs.util.strings import remove_quotes_and_escapes from ...libs.util.telemetry import posthog_logger -from .main import EditHighlightedCodeStep load_dotenv() -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") -openai.api_key = OPENAI_API_KEY - -FREE_USAGE_STEP_NAME = "Please enter OpenAI API key" def add_ellipsis(text: str, max_length: int = 200) -> str: @@ -40,48 +31,6 @@ class SimpleChatStep(Step): async def run(self, sdk: ContinueSDK): # Check if proxy server API key - if ( - isinstance(sdk.models.default, OpenAIFreeTrial) - and ( - sdk.models.default.api_key is None - or sdk.models.default.api_key.strip() == "" - ) - and len(list(filter(lambda x: not x.step.hide, sdk.history.timeline))) >= 10 - and len( - list( - filter( - lambda x: x.step.name == FREE_USAGE_STEP_NAME, - sdk.history.timeline, - ) - ) - ) - == 0 - ): - await sdk.run_step( - MessageStep( - name=FREE_USAGE_STEP_NAME, - message=dedent( - """\ - To make it easier to use Continue, you're getting limited free usage. When you have the chance, please enter your own OpenAI key in `~/.continue/config.py`. You can open the file by using the '/config' slash command in the text box below. - - Here's an example of how to edit the file: - ```python - ... - config=ContinueConfig( - ... - models=Models( - default=OpenAIFreeTrial(api_key="<API_KEY>", model="gpt-4"), - summarize=OpenAIFreeTrial(api_key="<API_KEY>", model="gpt-3.5-turbo") - ) - ) - ``` - - You can also learn more about customizations [here](https://continue.dev/docs/customization). 
- """ - ), - ) - ) - messages = self.messages or await sdk.get_chat_context() generator = sdk.models.chat.stream_chat( @@ -232,148 +181,3 @@ class EditFileStep(Step): async def run(self, sdk: ContinueSDK): await sdk.edit_file(self.filename, self.instructions) - -class ChatWithFunctions(Step): - user_input: str - functions: List[Step] = [ - AddFileStep(filename="", file_contents=""), - EditFileStep(filename="", instructions=""), - EditHighlightedCodeStep(user_input=""), - ViewDirectoryTreeStep(), - AddDirectoryStep(directory_name=""), - DeleteFileStep(filename=""), - RunTerminalCommandStep(command=""), - ] - name: str = "Input" - manage_own_chat_context: bool = True - description: str = "" - hide: bool = True - - async def run(self, sdk: ContinueSDK): - await sdk.update_ui() - - step_name_step_class_map = { - step.name.replace(" ", ""): step.__class__ for step in self.functions - } - - functions = [step_to_json_schema(function) for function in self.functions] - - self.chat_context.append( - ChatMessage(role="user", content=self.user_input, summary=self.user_input) - ) - - last_function_called_name = None - last_function_called_params = None - while True: - was_function_called = False - func_args = "" - func_name = "" - msg_content = "" - msg_step = None - - gpt350613 = OpenAI(model="gpt-3.5-turbo-0613") - await sdk.start_model(gpt350613) - - async for msg_chunk in gpt350613.stream_chat( - await sdk.get_chat_context(), functions=functions - ): - if sdk.current_step_was_deleted(): - return - - if "content" in msg_chunk and msg_chunk["content"] is not None: - msg_content += msg_chunk["content"] - # if last_function_called_index_in_history is not None: - # while sdk.history.timeline[last_function_called_index].step.hide: - # last_function_called_index += 1 - # sdk.history.timeline[last_function_called_index_in_history].step.description = msg_content - if msg_step is None: - msg_step = MessageStep( - name="Chat", message=msg_chunk["content"] - ) - await sdk.run_step(msg_step) - else: - msg_step.description = msg_content - await sdk.update_ui() - elif "function_call" in msg_chunk or func_name != "": - was_function_called = True - if "function_call" in msg_chunk: - if "arguments" in msg_chunk["function_call"]: - func_args += msg_chunk["function_call"]["arguments"] - if "name" in msg_chunk["function_call"]: - func_name += msg_chunk["function_call"]["name"] - - if not was_function_called: - self.chat_context.append( - ChatMessage( - role="assistant", content=msg_content, summary=msg_content - ) - ) - break - else: - if func_name == "python" and "python" not in step_name_step_class_map: - # GPT must be fine-tuned to believe this exists, but it doesn't always - func_name = "EditHighlightedCodeStep" - func_args = json.dumps({"user_input": self.user_input}) - # self.chat_context.append(ChatMessage( - # role="assistant", - # content=None, - # function_call=FunctionCall( - # name=func_name, - # arguments=func_args - # ), - # summary=f"Called function {func_name}" - # )) - # self.chat_context.append(ChatMessage( - # role="user", - # content="The 'python' function does not exist. Don't call it. Try again to call another function.", - # summary="'python' function does not exist." - # )) - # msg_step.hide = True - # continue - # Call the function, then continue to chat - func_args = "{}" if func_args == "" else func_args - try: - fn_call_params = json.loads(func_args) - except json.JSONDecodeError: - raise Exception("The model returned invalid JSON. 
Please try again") - self.chat_context.append( - ChatMessage( - role="assistant", - content=None, - function_call=FunctionCall(name=func_name, arguments=func_args), - summary=f"Called function {func_name}", - ) - ) - sdk.history.current_index + 1 - if func_name not in step_name_step_class_map: - raise Exception( - f"The model tried to call a function ({func_name}) that does not exist. Please try again." - ) - - # if func_name == "AddFileStep": - # step_to_run.hide = True - # self.description += f"\nAdded file `{func_args['filename']}`" - # elif func_name == "AddDirectoryStep": - # step_to_run.hide = True - # self.description += f"\nAdded directory `{func_args['directory_name']}`" - # else: - # self.description += f"\n`Running function {func_name}`\n\n" - if func_name == "EditHighlightedCodeStep": - fn_call_params["user_input"] = self.user_input - elif func_name == "EditFile": - fn_call_params["instructions"] = self.user_input - - step_to_run = step_name_step_class_map[func_name](**fn_call_params) - if ( - last_function_called_name is not None - and last_function_called_name == func_name - and last_function_called_params is not None - and last_function_called_params == fn_call_params - ): - # If it's calling the same function more than once in a row, it's probably looping and confused - return - last_function_called_name = func_name - last_function_called_params = fn_call_params - - await sdk.run_step(step_to_run) - await sdk.update_ui() diff --git a/server/continuedev/plugins/steps/setup_model.py b/server/continuedev/plugins/steps/setup_model.py index 87e52f1b..e7249594 100644 --- a/server/continuedev/plugins/steps/setup_model.py +++ b/server/continuedev/plugins/steps/setup_model.py @@ -5,16 +5,8 @@ from ...models.filesystem import RangeInFile from ...models.main import Range MODEL_CLASS_TO_MESSAGE = { - "OpenAI": "Obtain your OpenAI API key from [here](https://platform.openai.com/account/api-keys) and paste it into the `api_key` field at config.models.default.api_key in `config.py`. Then reload the VS Code window for changes to take effect.", - "OpenAIFreeTrial": "To get started with OpenAI models, obtain your OpenAI API key from [here](https://platform.openai.com/account/api-keys) and paste it into the `api_key` field at config.models.default.api_key in `config.py`. Then reload the VS Code window for changes to take effect.", - "AnthropicLLM": "To get started with Anthropic, you first need to sign up for the beta [here](https://claude.ai/login) to obtain an API key. Once you have the key, paste it into the `api_key` field at config.models.default.api_key in `config.py`. Then reload the VS Code window for changes to take effect.", - "ReplicateLLM": "To get started with Replicate, sign up to obtain an API key [here](https://replicate.ai/), then paste it into the `api_key` field at config.models.default.api_key in `config.py`.", "Ollama": "To get started with Ollama, download the app from [ollama.ai](https://ollama.ai/). Once it is downloaded, be sure to pull at least one model and use its name in the model field in config.py (e.g. `model='codellama'`).", - "GGML": "GGML models can be run locally using the `llama-cpp-python` library. To learn how to set up a local llama-cpp-python server, read [here](https://github.com/continuedev/ggml-server-example). Once it is started on port 8000, you're all set!", - "TogetherLLM": "To get started using models from Together, first obtain your Together API key from [here](https://together.ai). 
Paste it into the `api_key` field at config.models.default.api_key in `config.py`. Then, on their models page, press 'start' on the model of your choice and make sure the `model=` parameter in the config file for the `TogetherLLM` class reflects the name of this model. Finally, reload the VS Code window for changes to take effect.", - "LlamaCpp": "To get started with this model, clone the [`llama.cpp` repo](https://github.com/ggerganov/llama.cpp) and follow the instructions to set up the server [here](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#build). Any of the parameters described in the README can be passed to the `llama_cpp_args` field in the `LlamaCpp` class in `config.py`.", - "HuggingFaceInferenceAPI": "To get started with the HuggingFace Inference API, first deploy a model and obtain your API key from [here](https://huggingface.co/inference-api). Paste it into the `hf_token` field at config.models.default.hf_token in `config.py`. Finally, reload the VS Code window for changes to take effect.", - "GooglePaLMAPI": "To get started with the Google PaLM API, create an API key in Makersuite [here](https://makersuite.google.com/u/2/app/apikey), then paste it into the `api_key` field at config.models.default.api_key in `config.py`.", + "LlamaCpp": "To get started with this model, clone the [`llama.cpp` repo](https://github.com/ggerganov/llama.cpp) and follow the instructions to set up the server [here](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#build). Any of the parameters described in the README can be passed to the `llama_cpp_args` field in the `LlamaCpp` class in `config.py`." } diff --git a/server/tests/util/config.py b/server/tests/util/config.py index 370933a0..82811bc1 100644 --- a/server/tests/util/config.py +++ b/server/tests/util/config.py @@ -1,15 +1,18 @@ from continuedev.core.config import ContinueConfig from continuedev.core.models import Models -from continuedev.libs.llm.openai_free_trial import OpenAIFreeTrial +from continuedev.libs.llm import Ollama config = ContinueConfig( allow_anonymous_telemetry=False, models=Models( - default=OpenAIFreeTrial(api_key="", model="gpt-4"), - summarize=OpenAIFreeTrial( - api_key="", - model="gpt-3.5-turbo", - ), + default=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ), + summarize=Ollama( + title="CodeLlama-7b-Instruct", + model="codellama:7b-instruct" + ) ), system_message=None, temperature=0.5, |
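The retained `setup_model.py` message notes that Ollama must have at least one model pulled before Continue can use it. Below is a small, hedged sketch (not part of this diff) for checking that the model named in `config.py` is actually available locally; it assumes Ollama's default REST endpoint at `http://localhost:11434` and its `/api/tags` model listing.

```python
# Hedged sketch (not part of this diff): verify that the Ollama server is
# reachable and that the model referenced in config.py has been pulled.
# Assumes Ollama's default REST endpoint, http://localhost:11434/api/tags.
import json
import urllib.request


def ollama_has_model(name: str, host: str = "http://localhost:11434") -> bool:
    """Return True if an installed Ollama model's name starts with `name`."""
    with urllib.request.urlopen(f"{host}/api/tags") as resp:
        tags = json.load(resp)
    return any(m.get("name", "").startswith(name) for m in tags.get("models", []))


if __name__ == "__main__":
    print(ollama_has_model("codellama:7b-instruct"))
```

If the check returns `False`, pulling the model with the Ollama CLI (e.g. `ollama pull codellama:7b-instruct`) before starting the Continue server should resolve it.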