author     Nate Sesti <sestinj@gmail.com>   2023-05-23 23:45:12 -0400
committer  Nate Sesti <sestinj@gmail.com>   2023-05-23 23:45:12 -0400
commit     f53768612b1e2268697b5444e502032ef9f3fb3c (patch)
tree       4ed49b73e6bd3c2f8fceffa9643973033f87af95 /extension/scripts
copying from old repo
Diffstat (limited to 'extension/scripts')
-rw-r--r--  extension/scripts/.gitignore                          |   3
-rw-r--r--  extension/scripts/README.md                           |   5
-rw-r--r--  extension/scripts/chroma.py                           | 152
-rw-r--r--  extension/scripts/continuedev-0.1.0-py3-none-any.whl  | bin 0 -> 61142 bytes
-rw-r--r--  extension/scripts/index.py                            |  52
-rw-r--r--  extension/scripts/query.py                            |  63
-rw-r--r--  extension/scripts/replace.py                          |  17
-rw-r--r--  extension/scripts/requirements.txt                    |   6
-rw-r--r--  extension/scripts/run_continue_server.py              |   4
-rw-r--r--  extension/scripts/typegen.js                          |  53
-rw-r--r--  extension/scripts/update.py                           | 185
11 files changed, 540 insertions(+), 0 deletions(-)
diff --git a/extension/scripts/.gitignore b/extension/scripts/.gitignore
new file mode 100644
index 00000000..7af27c08
--- /dev/null
+++ b/extension/scripts/.gitignore
@@ -0,0 +1,3 @@
+testdb
+env
+stdout.txt
\ No newline at end of file
diff --git a/extension/scripts/README.md b/extension/scripts/README.md
new file mode 100644
index 00000000..0f851cb4
--- /dev/null
+++ b/extension/scripts/README.md
@@ -0,0 +1,5 @@
+# Scripts
+
+Whenever we need Python to run on the client side, we include a script file at the top level of this folder. All other files that are not meant to be run directly as scripts (utility files) should live in a subfolder of `scripts`. You can call one of these scripts from the VSCode extension using the `runPythonScript` function in `bridge.ts`.
+
+When the extension is activated (the `activate` function in `src/extension.ts`), we call `setupPythonEnv`, which creates the virtual environment and installs the requirements listed in `requirements.txt`. With this in mind, be sure to run `pip freeze > requirements.txt` whenever you add a new requirement.
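Each of the scripts below follows the same convention: a command silences incidental output and prints a single result dict to stdout for the extension to read back (see index.py). A minimal sketch of that contract, where the `stats` command and the script name are hypothetical and not part of this commit:

# stats_example.py -- hypothetical illustration of the script convention;
# assumes typer is installed in the extension's virtual environment.
from typer import Typer

app = Typer()

@app.command("stats")
def stats(cwd: str):
    # A real script would compute something useful from cwd; the only
    # requirement is that exactly one dict is printed to stdout.
    print({"cwd": cwd, "ok": True})

if __name__ == "__main__":
    app()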
diff --git a/extension/scripts/chroma.py b/extension/scripts/chroma.py
new file mode 100644
index 00000000..7425394e
--- /dev/null
+++ b/extension/scripts/chroma.py
@@ -0,0 +1,152 @@
+import chromadb
+import os
+import json
+import subprocess
+
+from typing import List, Tuple
+
+from chromadb.config import Settings
+
+client = chromadb.Client(Settings(
+    chroma_db_impl="duckdb+parquet",
+    persist_directory="./data/"
+))
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def get_git_root_dir(cwd: str):
+    """Get the root directory of a Git repository."""
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+    return result.stdout.decode().strip()
+
+def get_current_branch(cwd: str) -> str:
+    """Get the current Git branch."""
+    try:
+        return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except subprocess.CalledProcessError:
+        return "main"
+
+def get_current_commit(cwd: str) -> str:
+    try:
+        return subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except subprocess.CalledProcessError:
+        return "NO_COMMITS"
+
+def get_modified_deleted_files(cwd: str) -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch(cwd)
+    current_commit = get_current_commit(cwd)
+
+    with open(f"./data/{branch}.json", 'r') as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit], cwd=cwd).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir(cwd)
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, root), further_filter(deleted_files, root)
+
+def create_collection(branch: str, cwd: str):
+    """Create a new collection for the current branch."""
+    try:
+        collection = client.create_collection(name=branch)
+    except Exception as e:
+        print(e)
+        return
+
+    files = get_input_files(get_git_root_dir(cwd))
+    for file in files:
+        with open(file, 'r') as f:
+            collection.add(documents=[f.read()], ids=[file])
+            print(f"Added {file}")
+    with open(f"./data/{branch}.json", 'w') as f:
+        json.dump({"commit": get_current_commit(cwd)}, f)
+
+def collection_exists(cwd: str):
+    """Check if a collection exists."""
+    branch = get_current_branch(cwd)
+    return branch in [c.name for c in client.list_collections()]
+
+def update_collection(cwd: str):
+    """Update the collection."""
+    branch = get_current_branch(cwd)
+
+    try:
+
+        collection = client.get_collection(branch)
+
+        modified_files, deleted_files = get_modified_deleted_files(cwd)
+
+        for file in deleted_files:
+            collection.delete(ids=[file])
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+            with open(file, 'r') as f:
+                collection.update(documents=[f.read()], ids=[file])
+            print(f"Updated {file}")
+
+        with open(f"./data/{branch}.json", 'w') as f:
+            json.dump({"commit": get_current_commit(cwd)}, f)
+
+    except Exception:
+
+        create_collection(branch, cwd)
+
+def query_collection(query: str, n_results: int, cwd: str):
+    """Query the collection."""
+    branch = get_current_branch(cwd)
+    try:
+        collection = client.get_collection(branch)
+    except Exception:
+        create_collection(branch, cwd)
+        collection = client.get_collection(branch)
+    results = collection.query(query_texts=[query], n_results=n_results)
+    return results
\ No newline at end of file
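A minimal sketch of how the chroma.py helpers compose, assuming chromadb is installed and that /path/to/repo is a hypothetical Git checkout (both are assumptions, not part of this commit):

# chroma_usage_example.py -- illustrative only.
from chroma import create_collection, get_current_branch, query_collection, update_collection

cwd = "/path/to/repo"  # hypothetical repository path
branch = get_current_branch(cwd)
create_collection(branch, cwd)   # one Chroma collection per Git branch
update_collection(cwd)           # re-sync files changed since the stored commit
results = query_collection("Where is the server started?", 3, cwd)
print(results["documents"])      # chromadb returns parallel lists of ids/documents

Keying collections by branch name means switching branches never pollutes another branch's index; the stored commit in data/{branch}.json is what makes the later diff-based update possible.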
diff --git a/extension/scripts/continuedev-0.1.0-py3-none-any.whl b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
new file mode 100644
index 00000000..15787c59
--- /dev/null
+++ b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
Binary files differ
diff --git a/extension/scripts/index.py b/extension/scripts/index.py
new file mode 100644
index 00000000..3afc9131
--- /dev/null
+++ b/extension/scripts/index.py
@@ -0,0 +1,52 @@
+import sys
+import os
+from typing import TextIO
+from chroma import update_collection, query_collection, create_collection, collection_exists, get_current_branch
+from typer import Typer
+
+app = Typer()
+
+class SilenceStdoutContextManager:
+    _original_stdout: TextIO
+
+    def __enter__(self):
+        self._original_stdout = sys.stdout
+        sys.stdout = open(os.devnull, 'w')
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stdout.close()
+        sys.stdout = self._original_stdout
+
+silence = SilenceStdoutContextManager()
+
+@app.command("exists")
+def exists(cwd: str):
+    with silence:
+        exists = collection_exists(cwd)
+    print({"exists": exists})
+
+@app.command("create")
+def create(cwd: str):
+    with silence:
+        branch = get_current_branch(cwd)
+        create_collection(branch, cwd)
+    print({"success": True})
+
+@app.command("update")
+def update(cwd: str):
+    with silence:
+        update_collection(cwd)
+    print({"success": True})
+
+@app.command("query")
+def query(query: str, n_results: int, cwd: str):
+    with silence:
+        resp = query_collection(query, n_results, cwd)
+    results = [{
+        "id": resp["ids"][0][i],
+        "document": resp["documents"][0][i]
+    } for i in range(len(resp["ids"][0]))]
+    print({"results": results})
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file
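index.py is the entry point the extension would invoke as a subprocess; a hedged sketch of driving it that way (the repository path is hypothetical). Note that the result is a printed Python dict, not strict JSON, so whatever reads it must tolerate single-quoted keys:

# cli_example.py -- illustrative only.
import subprocess

out = subprocess.check_output(
    ["python3", "index.py", "query", "Where is the index updated?", "5", "/path/to/repo"]
).decode()
print(out)  # e.g. a line like {'results': [...]} printed by index.py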
diff --git a/extension/scripts/query.py b/extension/scripts/query.py
new file mode 100644
index 00000000..f2e44413
--- /dev/null
+++ b/extension/scripts/query.py
@@ -0,0 +1,63 @@
+import subprocess
+import sys
+from gpt_index import GPTSimpleVectorIndex, GPTFaissIndex
+import os
+from typer import Typer
+from enum import Enum
+from update import update_codebase_index, create_codebase_index, index_dir_for, get_current_branch
+from replace import replace_additional_index
+
+app = Typer()
+
+def query_codebase_index(query: str) -> str:
+    """Query the codebase index."""
+    branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+    path = f'data/{branch}/index.json'
+    if not os.path.exists(path):
+        print("No index found for the codebase")
+        return ""
+    index = GPTSimpleVectorIndex.load_from_disk(path)
+    return index.query(query)
+
+def query_additional_index(query: str) -> str:
+    """Query the additional index."""
+    index = GPTSimpleVectorIndex.load_from_disk('data/additional_index.json')
+    return index.query(query)
+
+class IndexTypeOption(str, Enum):
+    codebase = "codebase"
+    additional = "additional"
+
+@app.command()
+def query(context: IndexTypeOption, query: str):
+    if context == IndexTypeOption.additional:
+        response = query_additional_index(query)
+    elif context == IndexTypeOption.codebase:
+        response = query_codebase_index(query)
+    else:
+        print("Error: unknown context")
+    print({ "response": response })
+
+@app.command()
+def check_index_exists(root_path: str):
+    branch = get_current_branch()
+    exists = os.path.exists(index_dir_for(branch))
+    print({ "exists": exists })
+
+@app.command()
+def update():
+    update_codebase_index()
+    print("Updated codebase index")
+
+@app.command()
+def create_index(path: str):
+    create_codebase_index()
+    print("Created file index")
+
+@app.command()
+def replace_index(info: str):
+    replace_additional_index(info)
+    print("Replaced additional index")
+
+if __name__ == '__main__':
+    app()
\ No newline at end of file
diff --git a/extension/scripts/replace.py b/extension/scripts/replace.py
new file mode 100644
index 00000000..08810243
--- /dev/null
+++ b/extension/scripts/replace.py
@@ -0,0 +1,17 @@
+import sys
+from gpt_index import GPTSimpleVectorIndex, Document
+
+def replace_additional_index(info: str):
+    """Replace the additional index."""
+    with open('data/additional_context.txt', 'w') as f:
+        f.write(info)
+    documents = [Document(info)]
+    index = GPTSimpleVectorIndex(documents)
+    index.save_to_disk('data/additional_index.json')
+    print("Additional index replaced")
+
+if __name__ == "__main__":
+    """python3 replace.py <info>"""
+    info = sys.argv[1] if len(sys.argv) > 1 else None
+    if info:
+        replace_additional_index(info)
\ No newline at end of file
diff --git a/extension/scripts/requirements.txt b/extension/scripts/requirements.txt
new file mode 100644
index 00000000..27b48444
--- /dev/null
+++ b/extension/scripts/requirements.txt
@@ -0,0 +1,6 @@
+# chromadb==0.3.10
+# pathspec==0.11.0
+# typer==0.7.0
+# pydantic
+# pytest
+./continuedev-0.1.0-py3-none-any.whl
\ No newline at end of file
diff --git a/extension/scripts/run_continue_server.py b/extension/scripts/run_continue_server.py
new file mode 100644
index 00000000..089cc54d
--- /dev/null
+++ b/extension/scripts/run_continue_server.py
@@ -0,0 +1,4 @@
+from continuedev.server.main import run_server
+
+if __name__ == "__main__":
+    run_server()
diff --git a/extension/scripts/typegen.js b/extension/scripts/typegen.js
new file mode 100644
index 00000000..0bbff19e
--- /dev/null
+++ b/extension/scripts/typegen.js
@@ -0,0 +1,53 @@
+const fs = require("fs");
+const path = require("path");
+const { compile } = require("json-schema-to-typescript");
+
+function generateTypesForFile(inputPath, outputPath) {
+  let schema = JSON.parse(fs.readFileSync(inputPath, "utf8"));
+  let name = (inputPath.split("/").pop() || inputPath).split(".")[0];
+  // Work around json-schema-to-typescript not supporting $ref at the top level, which is what Pydantic generates for recursive types
+  if ("$ref" in schema) {
+    let temp = schema["$ref"];
+    delete schema["$ref"];
+    schema["allOf"] = [{ $ref: temp }];
+  }
+
+  compile(schema, name)
+    .then((ts) => {
+      fs.writeFileSync(path.join(outputPath, name + ".d.ts"), ts);
+    })
+    .catch((e) => {
+      console.log("Error generating types for " + name);
+      throw e;
+    });
+}
+
+function generateAllSchemas(inputDir, outputDir) {
+  // Generate types for every JSON schema in the input directory
+  try {
+    fs.readdirSync(inputDir).forEach((file) => {
+      if (file.endsWith(".json")) {
+        generateTypesForFile(path.join(inputDir, file), outputDir);
+      }
+    });
+  } catch (e) {
+    console.log(
+      "Make sure you are running this script from the extension/ directory."
+    );
+    throw e;
+  }
+}
+
+function deleteAllInDir(dir) {
+  fs.readdirSync(dir).forEach((file) => {
+    if (file.endsWith(".d.ts")) {
+      fs.unlinkSync(path.join(dir, file));
+    }
+  });
+}
+
+const OUTPUT_DIR = "schema";
+const INPUT_DIR = "../schema/json";
+
+deleteAllInDir(OUTPUT_DIR);
+generateAllSchemas(INPUT_DIR, OUTPUT_DIR);
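The top-level $ref rewrite in typegen.js is the one non-obvious step; here is the same transformation sketched in Python on a made-up recursive schema, to show what the rewrap does:

# ref_workaround_example.py -- illustrative only. json-schema-to-typescript
# rejects a top-level $ref (which Pydantic emits for recursive models), so
# typegen.js rewraps it in an allOf before compiling.
schema = {
    "$ref": "#/definitions/Node",
    "definitions": {
        "Node": {
            "type": "object",
            "properties": {
                "children": {"type": "array", "items": {"$ref": "#/definitions/Node"}},
            },
        },
    },
}
if "$ref" in schema:
    schema["allOf"] = [{"$ref": schema.pop("$ref")}]  # same effect as the JS above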
diff --git a/extension/scripts/update.py b/extension/scripts/update.py
new file mode 100644
index 00000000..15ad6ac0
--- /dev/null
+++ b/extension/scripts/update.py
@@ -0,0 +1,185 @@
+# import faiss
+import json
+import os
+import subprocess
+
+from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
+from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, Document, GPTFaissIndex
+from typing import List, Generator, Tuple
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def load_gpt_index_documents(root: str) -> List[Document]:
+    """Loads a list of GPTIndex Documents, respecting .gitignore files."""
+    # Get input files
+    input_files = get_input_files(root)
+    # Use SimpleDirectoryReader to load the files into Documents
+    return SimpleDirectoryReader(root, input_files=input_files, file_metadata=lambda filename: {"filename": filename}).load_data()
+
+def index_dir_for(branch: str) -> str:
+    return f"data/{branch}"
+
+def get_git_root_dir():
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return result.stdout.decode().strip()
+
+def get_current_branch() -> str:
+    return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+
+def get_current_commit() -> str:
+    return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+def create_codebase_index():
+    """Create a new index for the current branch."""
+    branch = get_current_branch()
+    if not os.path.exists(index_dir_for(branch)):
+        os.makedirs(index_dir_for(branch))
+
+    documents = load_gpt_index_documents(get_git_root_dir())
+
+    chunks = {}
+    doc_chunks = []
+    for doc in documents:
+        text_splitter = TokenTextSplitter()
+        text_chunks = text_splitter.split_text(doc.text)
+        filename = doc.extra_info["filename"]
+        chunks[filename] = len(text_chunks)
+        for i, text in enumerate(text_chunks):
+            doc_chunks.append(Document(text, doc_id=f"{filename}::{i}"))
+
+    with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+        json.dump({"commit": get_current_commit(), "chunks": chunks}, f, indent=4)
+
+    index = GPTSimpleVectorIndex([])
+    for chunk in doc_chunks:
+        index.insert(chunk)
+
+    # d = 1536  # Dimension of text-ada-embedding-002
+    # faiss_index = faiss.IndexFlatL2(d)
+    # index = GPTFaissIndex(documents, faiss_index=faiss_index)
+    # index.save_to_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+
+    index.save_to_disk(f"{index_dir_for(branch)}/index.json")
+
+    print("Codebase index created")
+
+def get_modified_deleted_files() -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch()
+    current_commit = get_current_commit()
+
+    metadata = f"{index_dir_for(branch)}/metadata.json"
+    with open(metadata, "r") as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit]).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir()
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, root), further_filter(deleted_files, root)
+
+def update_codebase_index():
+    """Update the index with a list of files."""
+    branch = get_current_branch()
+
+    if not os.path.exists(index_dir_for(branch)):
+        create_codebase_index()
+    else:
+        # index = GPTFaissIndex.load_from_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+        index = GPTSimpleVectorIndex.load_from_disk(f"{index_dir_for(branch)}/index.json")
+        modified_files, deleted_files = get_modified_deleted_files()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "r") as f:
+            metadata = json.load(f)
+
+        for file in deleted_files:
+
+            num_chunks = metadata["chunks"][file]
+            for i in range(num_chunks):
+                index.delete(f"{file}::{i}")
+
+            del metadata["chunks"][file]
+
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+
+            if file in metadata["chunks"]:
+
+                num_chunks = metadata["chunks"][file]
+
+                for i in range(num_chunks):
+                    index.delete(f"{file}::{i}")
+
+                print(f"Deleted old version of {file}")
+
+            with open(file, "r") as f:
+                text = f.read()
+
+            text_splitter = TokenTextSplitter()
+            text_chunks = text_splitter.split_text(text)
+
+            for i, text in enumerate(text_chunks):
+                index.insert(Document(text, doc_id=f"{file}::{i}"))
+
+            metadata["chunks"][file] = len(text_chunks)
+
+            print(f"Inserted new version of {file}")
+
+        metadata["commit"] = get_current_commit()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+            json.dump(metadata, f, indent=4)
+
+        print("Codebase index updated")
+
+if __name__ == "__main__":
+    """python3 update.py"""
+    update_codebase_index()
\ No newline at end of file
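The incremental update in update.py hinges on the {filename}::{i} doc-id scheme together with the per-file chunk counts persisted in metadata.json; a small sketch of that bookkeeping, with made-up file names and counts:

# chunk_ids_example.py -- illustrative only.
chunks = {"src/main.py": 3}  # metadata.json maps each file to its chunk count
doc_ids = [f"src/main.py::{i}" for i in range(chunks["src/main.py"])]
print(doc_ids)  # ['src/main.py::0', 'src/main.py::1', 'src/main.py::2']
# When a file changes, every old id is deleted from the index before the
# re-split chunks are inserted under the same naming scheme.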