author     Nate Sesti <sestinj@gmail.com>   2023-05-23 23:45:12 -0400
committer  Nate Sesti <sestinj@gmail.com>   2023-05-23 23:45:12 -0400
commit     f53768612b1e2268697b5444e502032ef9f3fb3c (patch)
tree       4ed49b73e6bd3c2f8fceffa9643973033f87af95 /extension/scripts
copying from old repo
Diffstat (limited to 'extension/scripts')
-rw-r--r--  extension/scripts/.gitignore                          |   3
-rw-r--r--  extension/scripts/README.md                           |   5
-rw-r--r--  extension/scripts/chroma.py                           | 152
-rw-r--r--  extension/scripts/continuedev-0.1.0-py3-none-any.whl  | bin 0 -> 61142 bytes
-rw-r--r--  extension/scripts/index.py                            |  52
-rw-r--r--  extension/scripts/query.py                            |  63
-rw-r--r--  extension/scripts/replace.py                          |  17
-rw-r--r--  extension/scripts/requirements.txt                    |   6
-rw-r--r--  extension/scripts/run_continue_server.py              |   4
-rw-r--r--  extension/scripts/typegen.js                          |  53
-rw-r--r--  extension/scripts/update.py                           | 185
11 files changed, 540 insertions(+), 0 deletions(-)
diff --git a/extension/scripts/.gitignore b/extension/scripts/.gitignore
new file mode 100644
index 00000000..7af27c08
--- /dev/null
+++ b/extension/scripts/.gitignore
@@ -0,0 +1,3 @@
+testdb
+env
+stdout.txt
\ No newline at end of file
diff --git a/extension/scripts/README.md b/extension/scripts/README.md
new file mode 100644
index 00000000..0f851cb4
--- /dev/null
+++ b/extension/scripts/README.md
@@ -0,0 +1,5 @@
+# Scripts
+
+Whenever we need Python to run on the client side, we include a script file at the top level of this folder. All other files that are not meant to be run directly as scripts (utility files) should live in a subfolder of `scripts`. You can call one of these scripts from the VSCode extension using the `runPythonScript` function in `bridge.ts`.
+
+When the extension is activated (the `activate` function in `src/extension.ts`), we call `setupPythonEnv`, which creates the virtual environment and installs the requirements listed in `requirements.txt`. With this in mind, be sure to run `pip freeze > requirements.txt` whenever you add a new requirement.
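Each of the scripts below follows the same convention: a command silences incidental output and prints a single result dict to stdout for the extension to read back (see index.py). A minimal sketch of that contract, where the `stats` command and the script name are hypothetical and not part of this commit:

# stats_example.py -- hypothetical illustration of the script convention;
# assumes typer is installed in the extension's virtual environment.
from typer import Typer

app = Typer()

@app.command("stats")
def stats(cwd: str):
    # A real script would compute something useful from cwd; the only
    # requirement is that exactly one dict is printed to stdout.
    print({"cwd": cwd, "ok": True})

if __name__ == "__main__":
    app()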
diff --git a/extension/scripts/chroma.py b/extension/scripts/chroma.py
new file mode 100644
index 00000000..7425394e
--- /dev/null
+++ b/extension/scripts/chroma.py
@@ -0,0 +1,152 @@
+import chromadb
+import os
+import json
+import subprocess
+
+from typing import List, Tuple
+
+from chromadb.config import Settings
+
+client = chromadb.Client(Settings(
+    chroma_db_impl="duckdb+parquet",
+    persist_directory="./data/"
+))
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def get_git_root_dir(cwd: str):
+    """Get the root directory of a Git repository."""
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+    return result.stdout.decode().strip()
+
+def get_current_branch(cwd: str) -> str:
+    """Get the current Git branch."""
+    try:
+        return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except subprocess.CalledProcessError:
+        return "main"
+
+def get_current_commit(cwd: str) -> str:
+    try:
+        return subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("utf-8").strip()
+    except subprocess.CalledProcessError:
+        return "NO_COMMITS"
+
+def get_modified_deleted_files(cwd: str) -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch(cwd)
+    current_commit = get_current_commit(cwd)
+
+    with open(f"./data/{branch}.json", 'r') as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit], cwd=cwd).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir(cwd)
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, root), further_filter(deleted_files, root)
+
+def create_collection(branch: str, cwd: str):
+    """Create a new collection for the current branch."""
+    try:
+        collection = client.create_collection(name=branch)
+    except Exception as e:
+        print(e)
+        return
+
+    files = get_input_files(get_git_root_dir(cwd))
+    for file in files:
+        with open(file, 'r') as f:
+            collection.add(documents=[f.read()], ids=[file])
+            print(f"Added {file}")
+    with open(f"./data/{branch}.json", 'w') as f:
+        json.dump({"commit": get_current_commit(cwd)}, f)
+
+def collection_exists(cwd: str):
+    """Check if a collection exists."""
+    branch = get_current_branch(cwd)
+    return branch in [c.name for c in client.list_collections()]
+
+def update_collection(cwd: str):
+    """Update the collection."""
+    branch = get_current_branch(cwd)
+
+    try:
+
+        collection = client.get_collection(branch)
+
+        modified_files, deleted_files = get_modified_deleted_files(cwd)
+
+        for file in deleted_files:
+            collection.delete(ids=[file])
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+            with open(file, 'r') as f:
+                collection.update(documents=[f.read()], ids=[file])
+            print(f"Updated {file}")
+
+        with open(f"./data/{branch}.json", 'w') as f:
+            json.dump({"commit": get_current_commit(cwd)}, f)
+
+    except Exception:
+
+        create_collection(branch, cwd)
+
+def query_collection(query: str, n_results: int, cwd: str):
+    """Query the collection."""
+    branch = get_current_branch(cwd)
+    try:
+        collection = client.get_collection(branch)
+    except Exception:
+        create_collection(branch, cwd)
+        collection = client.get_collection(branch)
+    results = collection.query(query_texts=[query], n_results=n_results)
+    return results
\ No newline at end of file
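A minimal sketch of how the chroma.py helpers compose, assuming chromadb is installed and that /path/to/repo is a hypothetical Git checkout (both are assumptions, not part of this commit):

# chroma_usage_example.py -- illustrative only.
from chroma import create_collection, get_current_branch, query_collection, update_collection

cwd = "/path/to/repo"  # hypothetical repository path
branch = get_current_branch(cwd)
create_collection(branch, cwd)   # one Chroma collection per Git branch
update_collection(cwd)           # re-sync files changed since the stored commit
results = query_collection("Where is the server started?", 3, cwd)
print(results["documents"])      # chromadb returns parallel lists of ids/documents

Keying collections by branch name means switching branches never pollutes another branch's index; the stored commit in data/{branch}.json is what makes the later diff-based update possible.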
diff --git a/extension/scripts/continuedev-0.1.0-py3-none-any.whl b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
new file mode 100644
index 00000000..15787c59
--- /dev/null
+++ b/extension/scripts/continuedev-0.1.0-py3-none-any.whl
Binary files differ
diff --git a/extension/scripts/index.py b/extension/scripts/index.py
new file mode 100644
index 00000000..3afc9131
--- /dev/null
+++ b/extension/scripts/index.py
@@ -0,0 +1,52 @@
+import sys
+import os
+from typing import TextIO
+from chroma import update_collection, query_collection, create_collection, collection_exists, get_current_branch
+from typer import Typer
+
+app = Typer()
+
+class SilenceStdoutContextManager:
+    _original_stdout: TextIO
+
+    def __enter__(self):
+        self._original_stdout = sys.stdout
+        sys.stdout = open(os.devnull, 'w')
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stdout.close()
+        sys.stdout = self._original_stdout
+
+silence = SilenceStdoutContextManager()
+
+@app.command("exists")
+def exists(cwd: str):
+    with silence:
+        exists = collection_exists(cwd)
+    print({"exists": exists})
+
+@app.command("create")
+def create(cwd: str):
+    with silence:
+        branch = get_current_branch(cwd)
+        create_collection(branch, cwd)
+    print({"success": True})
+
+@app.command("update")
+def update(cwd: str):
+    with silence:
+        update_collection(cwd)
+    print({"success": True})
+
+@app.command("query")
+def query(query: str, n_results: int, cwd: str):
+    with silence:
+        resp = query_collection(query, n_results, cwd)
+    results = [{
+        "id": resp["ids"][0][i],
+        "document": resp["documents"][0][i]
+    } for i in range(len(resp["ids"][0]))]
+    print({"results": results})
+
+if __name__ == "__main__":
+    app()
\ No newline at end of file
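index.py is the entry point the extension would invoke as a subprocess; a hedged sketch of driving it that way (the repository path is hypothetical). Note that the result is a printed Python dict, not strict JSON, so whatever reads it must tolerate single-quoted keys:

# cli_example.py -- illustrative only.
import subprocess

out = subprocess.check_output(
    ["python3", "index.py", "query", "Where is the index updated?", "5", "/path/to/repo"]
).decode()
print(out)  # e.g. a line like {'results': [...]} printed by index.py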
diff --git a/extension/scripts/query.py b/extension/scripts/query.py
new file mode 100644
index 00000000..f2e44413
--- /dev/null
+++ b/extension/scripts/query.py
@@ -0,0 +1,63 @@
+import subprocess
+import sys
+from gpt_index import GPTSimpleVectorIndex, GPTFaissIndex
+import os
+from typer import Typer
+from enum import Enum
+from update import update_codebase_index, create_codebase_index, index_dir_for, get_current_branch
+from replace import replace_additional_index
+
+app = Typer()
+
+def query_codebase_index(query: str) -> str:
+    """Query the codebase index."""
+    branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+    path = f'data/{branch}/index.json'
+    if not os.path.exists(path):
+        print("No index found for the codebase")
+        return ""
+    index = GPTSimpleVectorIndex.load_from_disk(path)
+    return index.query(query)
+
+def query_additional_index(query: str) -> str:
+    """Query the additional index."""
+    index = GPTSimpleVectorIndex.load_from_disk('data/additional_index.json')
+    return index.query(query)
+
+class IndexTypeOption(str, Enum):
+    codebase = "codebase"
+    additional = "additional"
+
+@app.command()
+def query(context: IndexTypeOption, query: str):
+    if context == IndexTypeOption.additional:
+        response = query_additional_index(query)
+    elif context == IndexTypeOption.codebase:
+        response = query_codebase_index(query)
+    else:
+        print("Error: unknown context")
+    print({ "response": response })
+
+@app.command()
+def check_index_exists(root_path: str):
+    branch = get_current_branch()
+    exists = os.path.exists(index_dir_for(branch))
+    print({ "exists": exists })
+
+@app.command()
+def update():
+    update_codebase_index()
+    print("Updated codebase index")
+
+@app.command()
+def create_index(path: str):
+    create_codebase_index()
+    print("Created file index")
+
+@app.command()
+def replace_index(info: str):
+    replace_additional_index(info)
+    print("Replaced additional index")
+
+if __name__ == '__main__':
+    app()
\ No newline at end of file
diff --git a/extension/scripts/replace.py b/extension/scripts/replace.py
new file mode 100644
index 00000000..08810243
--- /dev/null
+++ b/extension/scripts/replace.py
@@ -0,0 +1,17 @@
+import sys
+from gpt_index import GPTSimpleVectorIndex, Document
+
+def replace_additional_index(info: str):
+    """Replace the additional index."""
+    with open('data/additional_context.txt', 'w') as f:
+        f.write(info)
+    documents = [Document(info)]
+    index = GPTSimpleVectorIndex(documents)
+    index.save_to_disk('data/additional_index.json')
+    print("Additional index replaced")
+
+if __name__ == "__main__":
+    """python3 replace.py <info>"""
+    info = sys.argv[1] if len(sys.argv) > 1 else None
+    if info:
+        replace_additional_index(info)
\ No newline at end of file
diff --git a/extension/scripts/requirements.txt b/extension/scripts/requirements.txt
new file mode 100644
index 00000000..27b48444
--- /dev/null
+++ b/extension/scripts/requirements.txt
@@ -0,0 +1,6 @@
+# chromadb==0.3.10
+# pathspec==0.11.0
+# typer==0.7.0
+# pydantic
+# pytest
+./continuedev-0.1.0-py3-none-any.whl
\ No newline at end of file
diff --git a/extension/scripts/run_continue_server.py b/extension/scripts/run_continue_server.py
new file mode 100644
index 00000000..089cc54d
--- /dev/null
+++ b/extension/scripts/run_continue_server.py
@@ -0,0 +1,4 @@
+from continuedev.server.main import run_server
+
+if __name__ == "__main__":
+    run_server()
diff --git a/extension/scripts/typegen.js b/extension/scripts/typegen.js
new file mode 100644
index 00000000..0bbff19e
--- /dev/null
+++ b/extension/scripts/typegen.js
@@ -0,0 +1,53 @@
+const fs = require("fs");
+const path = require("path");
+const { compile } = require("json-schema-to-typescript");
+
+function generateTypesForFile(inputPath, outputPath) {
+  let schema = JSON.parse(fs.readFileSync(inputPath, "utf8"));
+  let name = (inputPath.split("/").pop() || inputPath).split(".")[0];
+  // Work around json-schema-to-typescript not supporting $ref at the top level, which is what Pydantic generates for recursive types
+  if ("$ref" in schema) {
+    let temp = schema["$ref"];
+    delete schema["$ref"];
+    schema["allOf"] = [{ $ref: temp }];
+  }
+
+  compile(schema, name)
+    .then((ts) => {
+      fs.writeFileSync(path.join(outputPath, name + ".d.ts"), ts);
+    })
+    .catch((e) => {
+      console.log("Error generating types for " + name);
+      throw e;
+    });
+}
+
+function generateAllSchemas(inputDir, outputDir) {
+  // Generate types for every JSON schema in the input directory
+  try {
+    fs.readdirSync(inputDir).forEach((file) => {
+      if (file.endsWith(".json")) {
+        generateTypesForFile(path.join(inputDir, file), outputDir);
+      }
+    });
+  } catch (e) {
+    console.log(
+      "Make sure you are running this script from the extension/ directory."
+    );
+    throw e;
+  }
+}
+
+function deleteAllInDir(dir) {
+  fs.readdirSync(dir).forEach((file) => {
+    if (file.endsWith(".d.ts")) {
+      fs.unlinkSync(path.join(dir, file));
+    }
+  });
+}
+
+const OUTPUT_DIR = "schema";
+const INPUT_DIR = "../schema/json";
+
+deleteAllInDir(OUTPUT_DIR);
+generateAllSchemas(INPUT_DIR, OUTPUT_DIR);
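The top-level $ref rewrite in typegen.js is the one non-obvious step; here is the same transformation sketched in Python on a made-up recursive schema, to show what the rewrap does:

# ref_workaround_example.py -- illustrative only. json-schema-to-typescript
# rejects a top-level $ref (which Pydantic emits for recursive models), so
# typegen.js rewraps it in an allOf before compiling.
schema = {
    "$ref": "#/definitions/Node",
    "definitions": {
        "Node": {
            "type": "object",
            "properties": {
                "children": {"type": "array", "items": {"$ref": "#/definitions/Node"}},
            },
        },
    },
}
if "$ref" in schema:
    schema["allOf"] = [{"$ref": schema.pop("$ref")}]  # same effect as the JS above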
diff --git a/extension/scripts/update.py b/extension/scripts/update.py
new file mode 100644
index 00000000..15ad6ac0
--- /dev/null
+++ b/extension/scripts/update.py
@@ -0,0 +1,185 @@
+# import faiss
+import json
+import os
+import subprocess
+
+from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
+from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, Document, GPTFaissIndex
+from typing import List, Generator, Tuple
+
+FILE_TYPES_TO_IGNORE = [
+    '.pyc',
+    '.png',
+    '.jpg',
+    '.jpeg',
+    '.gif',
+    '.svg',
+    '.ico'
+]
+
+def further_filter(files: List[str], root_dir: str):
+    """Further filter files before indexing."""
+    for file in files:
+        if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'):
+            continue
+        yield root_dir + "/" + file
+
+def get_git_root_dir(path: str):
+    """Get the root directory of a Git repository."""
+    try:
+        return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode()
+    except subprocess.CalledProcessError:
+        return None
+
+def get_git_ignored_files(root_dir: str):
+    """Get the list of ignored files in a Git repository."""
+    try:
+        output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode()
+        return output.split('\n')
+    except subprocess.CalledProcessError:
+        return []
+
+def get_all_files(root_dir: str):
+    """Get a list of all files in a directory."""
+    for dir_path, _, file_names in os.walk(root_dir):
+        for file_name in file_names:
+            yield os.path.join(os.path.relpath(dir_path, root_dir), file_name)
+
+def get_input_files(root_dir: str):
+    """Get a list of all files in a Git repository that are not ignored."""
+    ignored_files = set(get_git_ignored_files(root_dir))
+    all_files = set(get_all_files(root_dir))
+    nonignored_files = all_files - ignored_files
+    return further_filter(nonignored_files, root_dir)
+
+def load_gpt_index_documents(root: str) -> List[Document]:
+    """Loads a list of GPTIndex Documents, respecting .gitignore files."""
+    # Get input files
+    input_files = get_input_files(root)
+    # Use SimpleDirectoryReader to load the files into Documents
+    return SimpleDirectoryReader(root, input_files=input_files, file_metadata=lambda filename: {"filename": filename}).load_data()
+
+def index_dir_for(branch: str) -> str:
+    return f"data/{branch}"
+
+def get_git_root_dir():
+    result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return result.stdout.decode().strip()
+
+def get_current_branch() -> str:
+    return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip()
+
+def get_current_commit() -> str:
+    return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+def create_codebase_index():
+    """Create a new index for the current branch."""
+    branch = get_current_branch()
+    if not os.path.exists(index_dir_for(branch)):
+        os.makedirs(index_dir_for(branch))
+
+    documents = load_gpt_index_documents(get_git_root_dir())
+
+    chunks = {}
+    doc_chunks = []
+    for doc in documents:
+        text_splitter = TokenTextSplitter()
+        text_chunks = text_splitter.split_text(doc.text)
+        filename = doc.extra_info["filename"]
+        chunks[filename] = len(text_chunks)
+        for i, text in enumerate(text_chunks):
+            doc_chunks.append(Document(text, doc_id=f"{filename}::{i}"))
+
+    with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+        json.dump({"commit": get_current_commit(), "chunks": chunks}, f, indent=4)
+
+    index = GPTSimpleVectorIndex([])
+    for chunk in doc_chunks:
+        index.insert(chunk)
+
+    # d = 1536  # Dimension of text-ada-embedding-002
+    # faiss_index = faiss.IndexFlatL2(d)
+    # index = GPTFaissIndex(documents, faiss_index=faiss_index)
+    # index.save_to_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+
+    index.save_to_disk(f"{index_dir_for(branch)}/index.json")
+
+    print("Codebase index created")
+
+def get_modified_deleted_files() -> Tuple[List[str], List[str]]:
+    """Get a list of all files that have been modified since the last commit."""
+    branch = get_current_branch()
+    current_commit = get_current_commit()
+
+    metadata = f"{index_dir_for(branch)}/metadata.json"
+    with open(metadata, "r") as f:
+        previous_commit = json.load(f)["commit"]
+
+    modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit]).decode("utf-8").strip()
+    modified_deleted_files = modified_deleted_files.split("\n")
+    modified_deleted_files = [f for f in modified_deleted_files if f]
+
+    root = get_git_root_dir()
+    deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)]
+    modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)]
+
+    return further_filter(modified_files, root), further_filter(deleted_files, root)
+
+def update_codebase_index():
+    """Update the index with a list of files."""
+    branch = get_current_branch()
+
+    if not os.path.exists(index_dir_for(branch)):
+        create_codebase_index()
+    else:
+        # index = GPTFaissIndex.load_from_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
+        index = GPTSimpleVectorIndex.load_from_disk(f"{index_dir_for(branch)}/index.json")
+        modified_files, deleted_files = get_modified_deleted_files()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "r") as f:
+            metadata = json.load(f)
+
+        for file in deleted_files:
+
+            num_chunks = metadata["chunks"][file]
+            for i in range(num_chunks):
+                index.delete(f"{file}::{i}")
+
+            del metadata["chunks"][file]
+
+            print(f"Deleted {file}")
+
+        for file in modified_files:
+
+            if file in metadata["chunks"]:
+
+                num_chunks = metadata["chunks"][file]
+
+                for i in range(num_chunks):
+                    index.delete(f"{file}::{i}")
+
+                print(f"Deleted old version of {file}")
+
+            with open(file, "r") as f:
+                text = f.read()
+
+            text_splitter = TokenTextSplitter()
+            text_chunks = text_splitter.split_text(text)
+
+            for i, text in enumerate(text_chunks):
+                index.insert(Document(text, doc_id=f"{file}::{i}"))
+
+            metadata["chunks"][file] = len(text_chunks)
+
+            print(f"Inserted new version of {file}")
+
+        metadata["commit"] = get_current_commit()
+
+        with open(f"{index_dir_for(branch)}/metadata.json", "w") as f:
+            json.dump(metadata, f, indent=4)
+
+        print("Codebase index updated")
+
+if __name__ == "__main__":
+    """python3 update.py"""
+    update_codebase_index()
\ No newline at end of file
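The incremental update in update.py hinges on the {filename}::{i} doc-id scheme together with the per-file chunk counts persisted in metadata.json; a small sketch of that bookkeeping, with made-up file names and counts:

# chunk_ids_example.py -- illustrative only.
chunks = {"src/main.py": 3}  # metadata.json maps each file to its chunk count
doc_ids = [f"src/main.py::{i}" for i in range(chunks["src/main.py"])]
print(doc_ids)  # ['src/main.py::0', 'src/main.py::1', 'src/main.py::2']
# When a file changes, every old id is deleted from the index before the
# re-split chunks are inserted under the same naming scheme.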