import asyncio
from typing import Any, List, Union

from pydantic import Field

from ...core.main import ChatMessage
from .base import LLM, CompletionOptions


class QueuedLLM(LLM):
    """
    QueuedLLM exists to make up for LLM servers that cannot handle multiple requests at once. It uses a lock to ensure that only one request is being processed at a time.

    If you are already using another LLM class and are experiencing this problem, you can just wrap it with the QueuedLLM class like this:

    ```python title="~/.continue/config.py"
    from continuedev.libs.llm.queued import QueuedLLM

    config = ContinueConfig(
        ...
        models=Models(
            default=QueuedLLM(llm=<OTHER_LLM_CLASS>)
        )
    )
    ```
    """

    llm: LLM = Field(..., description="The LLM to wrap with a lock")
    _lock: asyncio.Lock

    model: str = "queued"

    def dict(self, **kwargs):
        return self.llm.dict(**kwargs)
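
    # start() initializes the wrapped LLM, creates the asyncio.Lock used to
    # serialize requests, and mirrors the wrapped model's settings (model name,
    # templates, context length) onto this wrapper.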
    async def start(self, *args, **kwargs):
        await super().start(*args, **kwargs)
        await self.llm.start(*args, **kwargs)
        self._lock = asyncio.Lock()
        self.model = self.llm.model
        self.template_messages = self.llm.template_messages
        self.prompt_templates = self.llm.prompt_templates
        self.context_length = self.llm.context_length

    async def stop(self):
        await self.llm.stop()

    def collect_args(self, options: CompletionOptions):
        return self.llm.collect_args(options)

    def compile_chat_messages(
        self,
        options: CompletionOptions,
        msgs: List[ChatMessage],
        functions: Union[List[Any], None] = None,
    ):
        return self.llm.compile_chat_messages(options, msgs, functions)

    def template_prompt_like_messages(self, prompt: str) -> str:
        return self.llm.template_prompt_like_messages(prompt)
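
    # The completion methods below hold the shared lock while calling the
    # wrapped LLM, so the underlying server only ever sees one request at a
    # time.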
    async def _complete(self, prompt: str, options: CompletionOptions):
        async with self._lock:
            resp = await self.llm._complete(prompt, options)
            return resp

    async def _stream_complete(self, prompt: str, options: CompletionOptions):
        async with self._lock:
            async for chunk in self.llm._stream_complete(prompt, options):
                yield chunk

    async def _stream_chat(
        self, messages: List[ChatMessage], options: CompletionOptions
    ):
        async with self._lock:
            async for chunk in self.llm._stream_chat(messages, options):
                yield chunk