summaryrefslogtreecommitdiff
path: root/server/continuedev/plugins/context_providers/url.py
blob: 1ed7c18e9ff1218fc77da3558dafeb1115639dbd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from typing import List, Optional, Tuple

import requests
from bs4 import BeautifulSoup
from pydantic import Field

from ...core.context import ContextProvider
from ...core.main import ContextItem, ContextItemDescription, ContextItemId
from .util import remove_meilisearch_disallowed_chars


class URLContextProvider(ContextProvider):
    """Type '@url' to reference the contents of a URL. You can either reference preset URLs, or reference one dynamically by typing '@url https://example.com'. The text contents of the page will be fetched and used as context."""

    title = "url"
    display_title = "URL"
    description = "Reference the contents of a webpage"
    dynamic = True
    requires_query = True

    # Allows users to provide a list of preset urls
    preset_urls: List[str] = Field(
        [],
        description="A list of preset URLs that you will be able to quickly reference by typing '@url'",
    )

    # Static items loaded from preset_urls
    static_url_context_items: List[ContextItem] = []

    # There is only a single dynamic url context item, so it has a static id
    _DYNAMIC_URL_CONTEXT_ITEM_ID = "url"

    # Upper bound, in seconds, on how long a single page fetch may take.
    # Without a timeout `requests.get` can wait forever on an unresponsive host.
    _REQUEST_TIMEOUT_SECONDS = 10

    # This is a template dynamic item that will generate context item on demand
    # when get item is called
    @property
    def DYNAMIC_CONTEXT_ITEM(self):
        """Build a fresh, empty placeholder item representing the dynamic URL entry."""
        return ContextItem(
            content="",
            description=ContextItemDescription(
                name="Dynamic URL",
                description="Reference the contents of a webpage (e.g. '@url https://www.w3schools.com/python/python_ref_functions.asp')",
                id=ContextItemId(
                    provider_title=self.title, item_id=self._DYNAMIC_URL_CONTEXT_ITEM_ID
                ),
            ),
        )

    def static_url_context_item_from_url(self, url: str) -> ContextItem:
        """Fetch `url` and wrap its text and title in a ContextItem.

        The item id is the URL passed through
        `remove_meilisearch_disallowed_chars`.
        """
        content, title = self._get_url_text_contents_and_title(url)
        return ContextItem(
            content=content,
            description=ContextItemDescription(
                name=title,
                description=f"Contents of {url}",
                id=ContextItemId(
                    provider_title=self.title,
                    item_id=remove_meilisearch_disallowed_chars(url),
                ),
            ),
        )

    @staticmethod
    def _url_from_query(query: str) -> str:
        """Return the URL portion of a query, dropping a leading 'url ' keyword.

        `str.lstrip("url ")` must not be used here: it strips any leading run
        of the characters 'u', 'r', 'l' and ' ', which mangles URLs such as
        'localhost:3000' or 'rust-lang.org'. Only the literal keyword is removed.

        Returns an empty string when the query holds no URL.
        """
        keyword = "url"
        candidate = query.strip()
        if candidate == keyword:
            # The keyword alone carries no URL.
            return ""
        if candidate.startswith(keyword + " "):
            candidate = candidate[len(keyword) + 1 :].strip()
        return candidate

    def _get_url_text_contents_and_title(self, url: str) -> Tuple[str, str]:
        """Download `url` and return `(page_text, title)`.

        The title is the page's <title> text when it is present and non-empty;
        otherwise it falls back to the URL with 'https://', 'http://' and
        'www.' removed.

        Exceptions raised by `requests.get` (including timeouts) propagate to
        the caller.
        """
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(response.text, "html.parser")
        title = url.replace("https://", "").replace("http://", "").replace("www.", "")

        # `soup.title.string` is None when <title> is empty or has nested tags,
        # so only override the fallback when there is real text to use.
        page_title = soup.title.string if soup.title is not None else None
        if page_title is not None:
            cleaned_title = str(page_title).strip()
            if cleaned_title:
                title = cleaned_title
        return soup.get_text(), title

    async def provide_context_items(self, workspace_dir: str) -> List[ContextItem]:
        """Fetch every preset URL and return the dynamic item followed by the static items.

        The fetched static items are stored on `static_url_context_items` so
        that `get_item` can return them later without fetching again.
        """
        self.static_url_context_items = [
            self.static_url_context_item_from_url(url) for url in self.preset_urls
        ]

        return [self.DYNAMIC_CONTEXT_ITEM] + self.static_url_context_items

    async def get_item(self, id: ContextItemId, query: str) -> Optional[ContextItem]:
        """Resolve `id` to a context item.

        A stored static item whose item id matches is returned as-is. Otherwise
        the URL is taken from `query`, fetched, and returned as a populated
        copy of the dynamic item.

        Returns None when `query` contains no URL.
        Raises Exception when `id` does not belong to this provider.
        """
        # Check if the item is a static item
        matching_static_item = next(
            (
                item
                for item in self.static_url_context_items
                if item.description.id.item_id == id.item_id
            ),
            None,
        )
        if matching_static_item:
            return matching_static_item

        # Check if the item is the dynamic item
        if id.provider_title != self.title:
            raise Exception("Invalid provider title for item")

        # Generate the dynamic item
        url = self._url_from_query(query)
        if not url:
            return None
        content, title = self._get_url_text_contents_and_title(url)

        ctx_item = self.DYNAMIC_CONTEXT_ITEM.copy()
        ctx_item.content = content
        ctx_item.description.name = title
        ctx_item.description.id.item_id = remove_meilisearch_disallowed_chars(url)
        return ctx_item