Diffstat (limited to 'extension/scripts')
-rw-r--r--  extension/scripts/.gitignore                          |   3
-rw-r--r--  extension/scripts/README.md                           |   5
-rw-r--r--  extension/scripts/chroma.py                           | 152
-rw-r--r--  extension/scripts/continuedev-0.1.0-py3-none-any.whl  | bin 0 -> 61142 bytes
-rw-r--r--  extension/scripts/index.py                            |  52
-rw-r--r--  extension/scripts/query.py                            |  63
-rw-r--r--  extension/scripts/replace.py                          |  17
-rw-r--r--  extension/scripts/requirements.txt                    |   6
-rw-r--r--  extension/scripts/run_continue_server.py              |   4
-rw-r--r--  extension/scripts/typegen.js                          |  53
-rw-r--r--  extension/scripts/update.py                           | 185
11 files changed, 540 insertions, 0 deletions
diff --git a/extension/scripts/.gitignore b/extension/scripts/.gitignore
new file mode 100644
index 00000000..7af27c08
--- /dev/null
+++ b/extension/scripts/.gitignore
@@ -0,0 +1,3 @@
+testdb
+env
+stdout.txt
\ No newline at end of file
diff --git a/extension/scripts/README.md b/extension/scripts/README.md
new file mode 100644
index 00000000..0f851cb4
--- /dev/null
+++ b/extension/scripts/README.md
@@ -0,0 +1,5 @@
+# Scripts
+
+Whenever we need python to run on the client side, we include a script file at the top level of this folder. All other files that are not to be run directly as a script (utility files) should be in a subfolder of `scripts`. You can call one of these scripts from the VSCode extension using the `runPythonScript` function in `bridge.ts`.
+
+When the extension is activated (`activate` function in `src/extension.ts`), we call `setupPythonEnv`, which makes the virtual environment and downloads all the necessary requirements as given in `requirements.txt`. With this in mind, be sure to run `pip freeze > requirements.txt` whenever you add a new requirement.
diff --git a/extension/scripts/chroma.py b/extension/scripts/chroma.py
new file mode 100644
index 00000000..7425394e
--- /dev/null
+++ b/extension/scripts/chroma.py
@@ -0,0 +1,152 @@
+import chromadb
+import os
+import json
+import subprocess
+
+from typing import List, Tuple
+
+from chromadb.config import Settings
+
+client = chromadb.Client(Settings(
+    chroma_db_impl="duckdb+parquet",
+    persist_directory="./data/"
+))
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def get_git_root_dir(cwd: str):
+    """Get the root directory of a Git repository."""
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+    return result.stdout.decode().strip()
+
+def get_current_branch(cwd: str) -> str:
+    """Get the current Git branch."""
+    try:
+        return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except:
+        return "main"
+
+def get_current_commit(cwd: str) -> str:
+    try:
+        return subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except:
+        return "NO_COMMITS"
+
+def get_modified_deleted_files(cwd: str) -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch(cwd)
+    current_commit = get_current_commit(cwd)
+
+    with open(f"./data/{branch}.json", 'r') as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit], cwd=cwd).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir(cwd)
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, root), further_filter(deleted_files, root)
+
+def create_collection(branch: str, cwd: str):
+    """Create a new collection, returning whether it already existed."""
+    try:
+        collection = client.create_collection(name=branch)
+    except Exception as e:
+        print(e)
+        return
+
+    files = get_input_files(get_git_root_dir(cwd))
+    for file in files:
+        with open(file, 'r') as f:
+            collection.add(documents=[f.read()], ids=[file])
+        print(f"Added {file}")
+    with open(f"./data/{branch}.json", 'w') as f:
+        json.dump({"commit": get_current_commit(cwd)}, f)
+
+def collection_exists(cwd: str):
+    """Check if a collection exists."""
+    branch = get_current_branch(cwd)
+    return branch in client.list_collections()
+
+def update_collection(cwd: str):
+    """Update the collection."""
+    branch = get_current_branch(cwd)
+
+    try:
+
+        collection = client.get_collection(branch)
+
+        modified_files, deleted_files = get_modified_deleted_files(cwd)
+
+        for file in deleted_files:
+            collection.delete(ids=[file])
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+            with open(file, 'r') as f:
+                collection.update(documents=[f.read()], ids=[file])
+            print(f"Updated {file}")
+
+        with open(f"./data/{branch}.json", 'w') as f:
+            json.dump({"commit": get_current_commit(cwd)}, f)
+
+    except:
+
+        create_collection(branch, cwd)
+
+def query_collection(query: str, n_results: int, cwd: str):
+    """Query the collection."""
+    branch = get_current_branch(cwd)
+    try:
+        collection = client.get_collection(branch)
+    except:
+        create_collection(branch, cwd)
+        collection = client.get_collection(branch)
+    results = collection.query(query_texts=[query], n_results=n_results)
+    return results
\ No newline at end of file
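A hypothetical usage sketch of the helpers above, not part of the commit; "/path/to/repo" is a placeholder, and the first call may build the collection from scratch by embedding every non-ignored file. Chroma's query results are parallel lists with one entry per query text, which is also how index.py below unpacks them.

# Hedged sketch: exercising chroma.py's query_collection directly.
# "/path/to/repo" is a placeholder repository path.
from chroma import query_collection

results = query_collection("where are embeddings persisted?", n_results=3, cwd="/path/to/repo")
# "ids" and "documents" are lists of lists, indexed first by query text:
for doc_id, document in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, document[:80])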
diff --git a/extension/scripts/continuedev-0.1.0-py3-none-any.whl b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
new file mode 100644
index 00000000..15787c59
--- /dev/null
+++ b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
Binary files differ
diff --git a/extension/scripts/index.py b/extension/scripts/index.py
new file mode 100644
index 00000000..3afc9131
--- /dev/null
+++ b/extension/scripts/index.py
@@ -0,0 +1,52 @@
+import sys
+import os
+from typing import TextIO
+from chroma import update_collection, query_collection, create_collection, collection_exists, get_current_branch
+from typer import Typer
+
+app = Typer()
+
+class SilenceStdoutContextManager:
+    saved_stdout: TextIO
+
+    def __enter__(self):
+        self._original_stdout = sys.stdout
+        sys.stdout = open(os.devnull, 'w')
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stdout.close()
+        sys.stdout = self._original_stdout
+
+silence = SilenceStdoutContextManager()
+
+@app.command("exists")
+def exists(cwd: str):
+    with silence:
+        exists = collection_exists(cwd)
+    print({"exists": exists})
+
+@app.command("create")
+def create(cwd: str):
+    with silence:
+        branch = get_current_branch(cwd)
+        create_collection(branch, cwd)
+    print({"success": True})
+
+@app.command("update")
+def update(cwd: str):
+    with silence:
+        update_collection(cwd)
+    print({"success": True})
+
+@app.command("query")
+def query(query: str, n_results: int, cwd: str):
+    with silence:
+        resp = query_collection(query, n_results, cwd)
+    results = [{
+        "id": resp["ids"][0][i],
+        "document": resp["documents"][0][i]
+    } for i in range(len(resp["ids"][0]))]
+    print({"results": results})
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file
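Per the README above, the extension invokes these commands through `runPythonScript` in `bridge.ts`. A hedged sketch of an equivalent subprocess call, with placeholder paths; note that the commands print Python dict literals rather than strict JSON, so a Python-side consumer would use ast.literal_eval rather than json.loads.

# Hedged sketch: invoking the Typer CLI in index.py as a subprocess.
import ast
import subprocess

out = subprocess.check_output(
    ["python3", "index.py", "query", "how is the index updated?", "5", "/path/to/repo"]
).decode("utf-8")
# out looks like "{'results': [{'id': ..., 'document': ...}]}"
results = ast.literal_eval(out)["results"]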
diff --git a/extension/scripts/query.py b/extension/scripts/query.py
new file mode 100644
index 00000000..f2e44413
--- /dev/null
+++ b/extension/scripts/query.py
@@ -0,0 +1,63 @@
+import subprocess
+import sys
+from gpt_index import GPTSimpleVectorIndex, GPTFaissIndex
+import os
+from typer import Typer
+from enum import Enum
+from update import update_codebase_index, create_codebase_index, index_dir_for, get_current_branch
+from replace import replace_additional_index
+
+app = Typer()
+
+def query_codebase_index(query: str) -> str:
+    """Query the codebase index."""
+    branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+    path = f'data/{branch}/index.json'
+    if not os.path.exists(path):
+        print("No index found for the codebase")
+        return ""
+    index = GPTFaissIndex.load_from_disk(path)
+    return index.query(query)
+
+def query_additional_index(query: str) -> str:
+    """Query the additional index."""
+    index = GPTSimpleVectorIndex.load_from_disk('data/additional_index.json')
+    return index.query(query)
+
+class IndexTypeOption(str, Enum):
+    codebase = "codebase"
+    additional = "additional"
+
+@app.command()
+def query(context: IndexTypeOption, query: str):
+    if context == IndexTypeOption.additional:
+        response = query_additional_index(query)
+    elif context == IndexTypeOption.codebase:
+        response = query_codebase_index(query)
+    else:
+        print("Error: unknown context")
+    print({ "response": response })
+
+@app.command()
+def check_index_exists(root_path: str):
+    branch = get_current_branch()
+    exists = os.path.exists(index_dir_for(branch))
+    print({ "exists": exists })
+
+@app.command()
+def update():
+    update_codebase_index()
+    print("Updated codebase index")
+
+@app.command()
+def create_index(path: str):
+    create_codebase_index()
+    print("Created file index")
+
+@app.command("replace-additional-index")
+def replace_additional(info: str):
+    replace_additional_index(info)
+    print("Replaced additional index")
+
+if __name__ == '__main__':
+    app()
\ No newline at end of file
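For reference, a minimal sketch of driving the two index types from Python instead of the CLI. It assumes the data/ directory produced by update.py and replace.py already exists and that OPENAI_API_KEY is set in the environment for gpt_index; the query strings are invented examples.

# Hedged sketch: calling query.py's helpers directly.
from query import query_codebase_index, query_additional_index

print(query_codebase_index("what does update.py cache in metadata.json?"))
print(query_additional_index("what extra context was provided?"))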
diff --git a/extension/scripts/replace.py b/extension/scripts/replace.py
new file mode 100644
index 00000000..08810243
--- /dev/null
+++ b/extension/scripts/replace.py
@@ -0,0 +1,17 @@
+import sys
+from gpt_index import GPTSimpleVectorIndex, Document
+
+def replace_additional_index(info: str):
+    """Replace the additional index."""
+    with open('data/additional_context.txt', 'w') as f:
+        f.write(info)
+    documents = [Document(info)]
+    index = GPTSimpleVectorIndex(documents)
+    index.save_to_disk('data/additional_index.json')
+    print("Additional index replaced")
+
+if __name__ == "__main__":
+    """python3 replace.py <info>"""
+    info = sys.argv[1] if len(sys.argv) > 1 else None
+    if info:
+        replace_additional_index(info)
\ No newline at end of file
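A hypothetical sketch of how this pairs with query.py: replace_additional_index writes data/additional_index.json, which the `query additional` command later loads. It assumes the data/ directory exists, and the context string is an invented example.

# Hedged sketch: seed the additional index, then read it back via query.py.
from replace import replace_additional_index

replace_additional_index("Deploys run `npm run build`, then upload the artifact.")
# Afterwards: python3 query.py query additional "how do we deploy?"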
diff --git a/extension/scripts/requirements.txt b/extension/scripts/requirements.txt
new file mode 100644
index 00000000..27b48444
--- /dev/null
+++ b/extension/scripts/requirements.txt
@@ -0,0 +1,6 @@
+# chromadb==0.3.10
+# pathspec==0.11.0
+# typer==0.7.0
+# pydantic
+# pytest
+./continuedev-0.1.0-py3-none-any.whl
\ No newline at end of file
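Per the README, `setupPythonEnv` installs these requirements into the extension's virtual environment; a hedged sketch of the equivalent manual step, assuming it is run from this directory so pip can resolve the local wheel path.

# Hedged sketch: manual equivalent of the extension's install step.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])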
diff --git a/extension/scripts/run_continue_server.py b/extension/scripts/run_continue_server.py
new file mode 100644
index 00000000..089cc54d
--- /dev/null
+++ b/extension/scripts/run_continue_server.py
@@ -0,0 +1,4 @@
+from continuedev.server.main import run_server
+
+if __name__ == "__main__":
+    run_server()
diff --git a/extension/scripts/typegen.js b/extension/scripts/typegen.js
new file mode 100644
index 00000000..0bbff19e
--- /dev/null
+++ b/extension/scripts/typegen.js
@@ -0,0 +1,53 @@
+const fs = require("fs");
+const path = require("path");
+const { compile } = require("json-schema-to-typescript");
+
+function generateTypesForFile(inputPath, outputPath) {
+  let schema = JSON.parse(fs.readFileSync(inputPath, "utf8"));
+  let name = (inputPath.split("/").pop() || inputPath).split(".")[0];
+  // This is to solve the issue of json-schema-to-typescript not supporting $ref at the top-level, which is what Pydantic generates for recursive types
+  if ("$ref" in schema) {
+    let temp = schema["$ref"];
+    delete schema["$ref"];
+    schema["allOf"] = [{ $ref: temp }];
+  }
+
+  compile(schema, name)
+    .then((ts) => {
+      fs.writeFileSync(path.join(outputPath, name + ".d.ts"), ts);
+    })
+    .catch((e) => {
+      console.log("Error generating types for " + name);
+      throw e;
+    });
+}
+
+function generateAllSchemas(inputDir, outputDir) {
+  // get the current directory
+  try {
+    fs.readdirSync(inputDir).forEach((file) => {
+      if (file.endsWith(".json")) {
+        generateTypesForFile(path.join(inputDir, file), outputDir);
+      }
+    });
+  } catch (e) {
+    console.log(
+      "Make sure you are running this script from the extension/ directory."
+    );
+    throw e;
+  }
+}
+
+function deleteAllInDir(dir) {
+  fs.readdirSync(dir).forEach((file) => {
+    if (file.endsWith(".d.ts")) {
+      fs.unlinkSync(path.join(dir, file));
+    }
+  });
+}
+
+OUTPUT_DIR = "schema";
+INPUT_DIR = "../schema/json";
+
+deleteAllInDir(OUTPUT_DIR);
+generateAllSchemas(INPUT_DIR, OUTPUT_DIR);
diff --git a/extension/scripts/update.py b/extension/scripts/update.py
new file mode 100644
index 00000000..15ad6ac0
--- /dev/null
+++ b/extension/scripts/update.py
@@ -0,0 +1,185 @@
+# import faiss
+import json
+import os
+import subprocess
+
+from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
+from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, Document, GPTFaissIndex
+from typing import List, Generator, Tuple
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def load_gpt_index_documents(root: str) -> List[Document]:
+    """Loads a list of GPTIndex Documents, respecting .gitignore files."""
+    # Get input files
+    input_files = get_input_files(root)
+    # Use SimpleDirectoryReader to load the files into Documents
+    return SimpleDirectoryReader(root, input_files=input_files, file_metadata=lambda filename: {"filename": filename}).load_data()
+
+def index_dir_for(branch: str) -> str:
+    return f"data/{branch}"
+
+def get_git_root_dir():
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return result.stdout.decode().strip()
+
+def get_current_branch() -> str:
+    return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+
+def get_current_commit() -> str:
+    return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+def create_codebase_index():
+    """Create a new index for the current branch."""
+    branch = get_current_branch()
+    if not os.path.exists(index_dir_for(branch)):
+        os.makedirs(index_dir_for(branch))
+
+    documents = load_gpt_index_documents(get_git_root_dir())
+
+    chunks = {}
+    doc_chunks = []
+    for doc in documents:
+        text_splitter = TokenTextSplitter()
+        text_chunks = text_splitter.split_text(doc.text)
+        filename = doc.extra_info["filename"]
+        chunks[filename] = len(text_chunks)
+        for i, text in enumerate(text_chunks):
+            doc_chunks.append(Document(text, doc_id=f"{filename}::{i}"))
+
+    with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+        json.dump({"commit": get_current_commit(), "chunks": chunks}, f, indent=4)
+
+    index = GPTSimpleVectorIndex([])
+    for chunk in doc_chunks:
+        index.insert(chunk)
+
+    # d = 1536 # Dimension of text-ada-embedding-002
+    # faiss_index = faiss.IndexFlatL2(d)
+    # index = GPTFaissIndex(documents, faiss_index=faiss_index)
+    # index.save_to_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+
+    index.save_to_disk(f"{index_dir_for(branch)}/index.json")
+
+    print("Codebase index created")
+
+def get_modified_deleted_files() -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch()
+    current_commit = get_current_commit()
+
+    metadata = f"{index_dir_for(branch)}/metadata.json"
+    with open(metadata, "r") as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit]).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir()
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, index_dir_for(branch)), further_filter(deleted_files, index_dir_for(branch))
+
+def update_codebase_index():
+    """Update the index with a list of files."""
+    branch = get_current_branch()
+
+    if not os.path.exists(index_dir_for(branch)):
+        create_codebase_index()
+    else:
+        # index = GPTFaissIndex.load_from_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+        index = GPTSimpleVectorIndex.load_from_disk(f"{index_dir_for(branch)}/index.json")
+        modified_files, deleted_files = get_modified_deleted_files()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "r") as f:
+            metadata = json.load(f)
+
+        for file in deleted_files:
+
+            num_chunks = metadata["chunks"][file]
+            for i in range(num_chunks):
+                index.delete(f"{file}::{i}")
+
+            del metadata["chunks"][file]
+
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+
+            if file in metadata["chunks"]:
+
+                num_chunks = metadata["chunks"][file]
+
+                for i in range(num_chunks):
+                    index.delete(f"{file}::{i}")
+
+                print(f"Deleted old version of {file}")
+
+            with open(file, "r") as f:
+                text = f.read()
+
+            text_splitter = TokenTextSplitter()
+            text_chunks = text_splitter.split_text(text)
+
+            for i, text in enumerate(text_chunks):
+                index.insert(Document(text, doc_id=f"{file}::{i}"))
+
+            metadata["chunks"][file] = len(text_chunks)
+
+            print(f"Inserted new version of {file}")
+
+        metadata["commit"] = get_current_commit()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+            json.dump(metadata, f, indent=4)
+
+        print("Codebase index updated")
+
+if __name__ == "__main__":
+    """python3 update.py"""
+    update_codebase_index()
\ No newline at end of file
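update.py's incremental scheme hinges on the doc_id convention "<filename>::<chunk index>" plus the per-file chunk counts cached in metadata.json: deleting a stale file means deleting exactly num_chunks ids. A small sketch of that bookkeeping, assuming gpt_index is installed and run from this directory.

# Hedged sketch: illustrating the chunk-id scheme update.py relies on.
from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter

with open("update.py", "r") as f:
    text_chunks = TokenTextSplitter().split_text(f.read())

ids = [f"update.py::{i}" for i in range(len(text_chunks))]
print(len(ids), "chunks, e.g.", ids[:2])  # the count metadata.json's "chunks" entry tracks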
