summaryrefslogtreecommitdiff
path: root/server/continuedev/libs/chroma/update.py
blob: 7a1217f9719af728107cafe3ab17d85e64c73c39 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# import faiss
import os
import subprocess
from typing import List

from dotenv import load_dotenv
from llama_index import Document, SimpleDirectoryReader

# Runs at import time: python-dotenv reads a .env file (if one is found) into
# the process environment.
load_dotenv()

# File-name suffixes that filter_ignored_files() skips when choosing which
# files to index.
FILE_TYPES_TO_IGNORE = [".pyc", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"]


def filter_ignored_files(files: List[str], root_dir: str):
    """Lazily yield the paths from ``files`` that should be indexed.

    A path is dropped when it ends with one of the suffixes listed in
    ``FILE_TYPES_TO_IGNORE`` or when it starts with ``.git`` or ``archive``.
    Every surviving path is yielded as ``root_dir + "/" + path``.
    """
    skipped_suffixes = tuple(FILE_TYPES_TO_IGNORE)
    skipped_prefixes = (".git", "archive")
    for relative_path in files:
        # Guard clauses: bail out of this iteration as soon as one rule hits.
        if relative_path.startswith(skipped_prefixes):
            continue
        if relative_path.endswith(skipped_suffixes):
            continue
        yield root_dir + "/" + relative_path


def get_git_ignored_files(root_dir: str):
    """Get the list of ignored files in a Git repository.

    Runs ``git ls-files --ignored --others --exclude-standard -z`` with
    ``root_dir`` as the working directory and parses its output.

    Args:
        root_dir: Directory in which the ``git`` command is executed.

    Returns:
        A list of the paths printed by Git, one string per entry. The list
        is empty when Git reports nothing, when the command exits with a
        non-zero status (for example ``root_dir`` is not inside a
        repository), or when the ``git`` process cannot be started at all.
    """
    try:
        raw_output = subprocess.check_output(
            # -z: NUL-terminated entries and no C-style quoting, so names
            # with non-ASCII or special characters come back verbatim.
            ["git", "ls-files", "--ignored", "--others", "--exclude-standard", "-z"],
            cwd=root_dir,
        )
    except (subprocess.CalledProcessError, OSError):
        # Best effort: a failing git, a missing git executable, or an
        # unusable working directory all mean "no ignore information".
        return []
    # Drop empty chunks so that empty output yields [] rather than [""].
    # os.fsdecode mirrors how os.walk() turns file-system bytes into str.
    return [os.fsdecode(entry) for entry in raw_output.split(b"\0") if entry]


def get_all_files(root_dir: str):
    """Yield every file below ``root_dir`` as a path relative to ``root_dir``.

    The tree is traversed with ``os.walk``. Files located directly in
    ``root_dir`` are yielded as their bare name (``"setup.py"``, not
    ``"./setup.py"``); files in sub-directories are yielded as
    ``os.path.join(<relative directory>, <file name>)``.

    Args:
        root_dir: Directory to walk.

    Yields:
        One relative path string per file found.
    """
    for dir_path, _, file_names in os.walk(root_dir):
        relative_dir = os.path.relpath(dir_path, root_dir)
        # relpath() gives "." for root_dir itself. Joining with that would
        # prefix top-level files with "./", so they would never compare equal
        # to the plain relative paths they are matched against elsewhere.
        prefix = "" if relative_dir == os.curdir else relative_dir
        for file_name in file_names:
            yield os.path.join(prefix, file_name)


def get_input_files(root_dir: str):
    """Return the files under ``root_dir`` that are candidates for indexing.

    Takes everything found by ``get_all_files``, removes the entries that
    ``get_git_ignored_files`` reports, and passes the remainder through
    ``filter_ignored_files``, whose result is returned as-is.
    """
    git_ignored = get_git_ignored_files(root_dir)
    # Set difference: keep only the walked paths Git did not report.
    candidates = set(get_all_files(root_dir)).difference(git_ignored)
    return filter_ignored_files(candidates, root_dir)


def load_gpt_index_documents(root: str) -> List[Document]:
    """Load the files selected by ``get_input_files(root)`` as Documents.

    A ``SimpleDirectoryReader`` is built for ``root`` with those files as its
    explicit ``input_files``; the result of its ``load_data()`` is returned.
    Each file's metadata is a dict with the single key ``"filename"``.
    """

    def filename_metadata(filename):
        # Metadata callback handed to the reader.
        return {"filename": filename}

    reader = SimpleDirectoryReader(
        root,
        input_files=get_input_files(root),
        file_metadata=filename_metadata,
    )
    return reader.load_data()