diff options
Diffstat (limited to 'extension/scripts')
-rw-r--r-- | extension/scripts/.gitignore | 5 | ||||
-rw-r--r-- | extension/scripts/README.md | 5 | ||||
-rw-r--r-- | extension/scripts/chroma.py | 152 | ||||
-rw-r--r-- | extension/scripts/index.py | 52 | ||||
-rw-r--r-- | extension/scripts/query.py | 63 | ||||
-rw-r--r-- | extension/scripts/replace.py | 17 | ||||
-rw-r--r-- | extension/scripts/requirements.txt | 6 | ||||
-rw-r--r-- | extension/scripts/run_continue_server.py | 4 | ||||
-rw-r--r-- | extension/scripts/update.py | 185 |
9 files changed, 0 insertions, 489 deletions
diff --git a/extension/scripts/.gitignore b/extension/scripts/.gitignore deleted file mode 100644 index fbb3bf9f..00000000 --- a/extension/scripts/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -testdb -env -stdout.txt -.continue_env_installed -**.whl
\ No newline at end of file diff --git a/extension/scripts/README.md b/extension/scripts/README.md deleted file mode 100644 index da1ad493..00000000 --- a/extension/scripts/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Scripts - -Whenever we need python to run on the client side, we include a script file at the top level of this folder. All other files that are not to be run directly as a script (utility files) should be in a subfolder of `scripts`. You can call one of these scripts from the VS Code extension using the `runPythonScript` function in `bridge.ts`. - -When the extension is activated (`activate` function in `src/extension.ts`), we call `setupPythonEnv`, which makes the virtual environment and downloads all the necessary requirements as given in `requirements.txt`. With this in mind, be sure to run `pip freeze > requirements.txt` whenever you add a new requirement. diff --git a/extension/scripts/chroma.py b/extension/scripts/chroma.py deleted file mode 100644 index 7425394e..00000000 --- a/extension/scripts/chroma.py +++ /dev/null @@ -1,152 +0,0 @@ -import chromadb -import os -import json -import subprocess - -from typing import List, Tuple - -from chromadb.config import Settings - -client = chromadb.Client(Settings( - chroma_db_impl="duckdb+parquet", - persist_directory="./data/" -)) - -FILE_TYPES_TO_IGNORE = [ - '.pyc', - '.png', - '.jpg', - '.jpeg', - '.gif', - '.svg', - '.ico' -] - -def further_filter(files: List[str], root_dir: str): - """Further filter files before indexing.""" - for file in files: - if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'): - continue - yield root_dir + "/" + file - -def get_git_root_dir(path: str): - """Get the root directory of a Git repository.""" - try: - return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode() - except subprocess.CalledProcessError: - return None - -def get_git_ignored_files(root_dir: str): - """Get the list of ignored files in a Git repository.""" - try: - output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode() - return output.split('\n') - except subprocess.CalledProcessError: - return [] - -def get_all_files(root_dir: str): - """Get a list of all files in a directory.""" - for dir_path, _, file_names in os.walk(root_dir): - for file_name in file_names: - yield os.path.join(os.path.relpath(dir_path, root_dir), file_name) - -def get_input_files(root_dir: str): - """Get a list of all files in a Git repository that are not ignored.""" - ignored_files = set(get_git_ignored_files(root_dir)) - all_files = set(get_all_files(root_dir)) - nonignored_files = all_files - ignored_files - return further_filter(nonignored_files, root_dir) - -def get_git_root_dir(cwd: str): - """Get the root directory of a Git repository.""" - result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) - return result.stdout.decode().strip() - -def get_current_branch(cwd: str) -> str: - """Get the current Git branch.""" - try: - return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=cwd).decode("utf-8").strip() - except: - return "main" - -def get_current_commit(cwd: str) -> str: - try: - return subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("utf-8").strip() - except: - return "NO_COMMITS" - -def get_modified_deleted_files(cwd: str) -> Tuple[List[str], List[str]]: - """Get a list of all files that have been modified since the last commit.""" - branch = get_current_branch(cwd) - current_commit = get_current_commit(cwd) - - with open(f"./data/{branch}.json", 'r') as f: - previous_commit = json.load(f)["commit"] - - modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit], cwd=cwd).decode("utf-8").strip() - modified_deleted_files = modified_deleted_files.split("\n") - modified_deleted_files = [f for f in modified_deleted_files if f] - - root = get_git_root_dir(cwd) - deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)] - modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)] - - return further_filter(modified_files, root), further_filter(deleted_files, root) - -def create_collection(branch: str, cwd: str): - """Create a new collection, returning whether it already existed.""" - try: - collection = client.create_collection(name=branch) - except Exception as e: - print(e) - return - - files = get_input_files(get_git_root_dir(cwd)) - for file in files: - with open(file, 'r') as f: - collection.add(documents=[f.read()], ids=[file]) - print(f"Added {file}") - with open(f"./data/{branch}.json", 'w') as f: - json.dump({"commit": get_current_commit(cwd)}, f) - -def collection_exists(cwd: str): - """Check if a collection exists.""" - branch = get_current_branch(cwd) - return branch in client.list_collections() - -def update_collection(cwd: str): - """Update the collection.""" - branch = get_current_branch(cwd) - - try: - - collection = client.get_collection(branch) - - modified_files, deleted_files = get_modified_deleted_files(cwd) - - for file in deleted_files: - collection.delete(ids=[file]) - print(f"Deleted {file}") - - for file in modified_files: - with open(file, 'r') as f: - collection.update(documents=[f.read()], ids=[file]) - print(f"Updated {file}") - - with open(f"./data/{branch}.json", 'w') as f: - json.dump({"commit": get_current_commit(cwd)}, f) - - except: - - create_collection(branch, cwd) - -def query_collection(query: str, n_results: int, cwd: str): - """Query the collection.""" - branch = get_current_branch(cwd) - try: - collection = client.get_collection(branch) - except: - create_collection(branch, cwd) - collection = client.get_collection(branch) - results = collection.query(query_texts=[query], n_results=n_results) - return results
\ No newline at end of file diff --git a/extension/scripts/index.py b/extension/scripts/index.py deleted file mode 100644 index 3afc9131..00000000 --- a/extension/scripts/index.py +++ /dev/null @@ -1,52 +0,0 @@ -import sys -import os -from typing import TextIO -from chroma import update_collection, query_collection, create_collection, collection_exists, get_current_branch -from typer import Typer - -app = Typer() - -class SilenceStdoutContextManager: - saved_stdout: TextIO - - def __enter__(self): - self._original_stdout = sys.stdout - sys.stdout = open(os.devnull, 'w') - - def __exit__(self, exc_type, exc_val, exc_tb): - sys.stdout.close() - sys.stdout = self._original_stdout - -silence = SilenceStdoutContextManager() - -@app.command("exists") -def exists(cwd: str): - with silence: - exists = collection_exists(cwd) - print({"exists": exists}) - -@app.command("create") -def create(cwd: str): - with silence: - branch = get_current_branch(cwd) - create_collection(branch, cwd) - print({"success": True}) - -@app.command("update") -def update(cwd: str): - with silence: - update_collection(cwd) - print({"success": True}) - -@app.command("query") -def query(query: str, n_results: int, cwd: str): - with silence: - resp = query_collection(query, n_results, cwd) - results = [{ - "id": resp["ids"][0][i], - "document": resp["documents"][0][i] - } for i in range(len(resp["ids"][0]))] - print({"results": results}) - -if __name__ == "__main__": - app()
\ No newline at end of file diff --git a/extension/scripts/query.py b/extension/scripts/query.py deleted file mode 100644 index f2e44413..00000000 --- a/extension/scripts/query.py +++ /dev/null @@ -1,63 +0,0 @@ -import subprocess -import sys -from gpt_index import GPTSimpleVectorIndex, GPTFaissIndex -import os -from typer import Typer -from enum import Enum -from update import update_codebase_index, create_codebase_index, index_dir_for, get_current_branch -from replace import replace_additional_index - -app = Typer() - -def query_codebase_index(query: str) -> str: - """Query the codebase index.""" - branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip() - path = 'data/{branch}/index.json' - if not os.path.exists(path): - print("No index found for the codebase") - return "" - index = GPTFaissIndex.load_from_disk(path) - return index.query(query) - -def query_additional_index(query: str) -> str: - """Query the additional index.""" - index = GPTSimpleVectorIndex.load_from_disk('data/additional_index.json') - return index.query(query) - -class IndexTypeOption(str, Enum): - codebase = "codebase" - additional = "additional" - -@app.command() -def query(context: IndexTypeOption, query: str): - if context == IndexTypeOption.additional: - response = query_additional_index(query) - elif context == IndexTypeOption.codebase: - response = query_codebase_index(query) - else: - print("Error: unknown context") - print({ "response": response }) - -@app.command() -def check_index_exists(root_path: str): - branch = get_current_branch() - exists = os.path.exists(index_dir_for(branch)) - print({ "exists": exists }) - -@app.command() -def update(): - update_codebase_index() - print("Updated codebase index") - -@app.command() -def create_index(path: str): - create_codebase_index() - print("Created file index") - -@app.command() -def replace_additional_index(info: str): - replace_additional_index() - print("Replaced additional index") - -if __name__ == '__main__': - app()
\ No newline at end of file diff --git a/extension/scripts/replace.py b/extension/scripts/replace.py deleted file mode 100644 index 08810243..00000000 --- a/extension/scripts/replace.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys -from gpt_index import GPTSimpleVectorIndex, Document - -def replace_additional_index(info: str): - """Replace the additional index.""" - with open('data/additional_context.txt', 'w') as f: - f.write(info) - documents = [Document(info)] - index = GPTSimpleVectorIndex(documents) - index.save_to_disk('data/additional_index.json') - print("Additional index replaced") - -if __name__ == "__main__": - """python3 replace.py <info>""" - info = sys.argv[1] if len(sys.argv) > 1 else None - if info: - replace_additional_index(info)
\ No newline at end of file diff --git a/extension/scripts/requirements.txt b/extension/scripts/requirements.txt deleted file mode 100644 index c51c9d73..00000000 --- a/extension/scripts/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -# chromadb==0.3.10 -# pathspec==0.11.0 -# typer==0.7.0 -# pydantic -# pytest -./continuedev-0.1.2-py3-none-any.whl
\ No newline at end of file diff --git a/extension/scripts/run_continue_server.py b/extension/scripts/run_continue_server.py deleted file mode 100644 index 089cc54d..00000000 --- a/extension/scripts/run_continue_server.py +++ /dev/null @@ -1,4 +0,0 @@ -from continuedev.server.main import run_server - -if __name__ == "__main__": - run_server() diff --git a/extension/scripts/update.py b/extension/scripts/update.py deleted file mode 100644 index 15ad6ac0..00000000 --- a/extension/scripts/update.py +++ /dev/null @@ -1,185 +0,0 @@ -# import faiss -import json -import os -import subprocess - -from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter -from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, Document, GPTFaissIndex -from typing import List, Generator, Tuple - -FILE_TYPES_TO_IGNORE = [ - '.pyc', - '.png', - '.jpg', - '.jpeg', - '.gif', - '.svg', - '.ico' -] - -def further_filter(files: List[str], root_dir: str): - """Further filter files before indexing.""" - for file in files: - if file.endswith(tuple(FILE_TYPES_TO_IGNORE)) or file.startswith('.git') or file.startswith('archive'): - continue - yield root_dir + "/" + file - -def get_git_root_dir(path: str): - """Get the root directory of a Git repository.""" - try: - return subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=path).strip().decode() - except subprocess.CalledProcessError: - return None - -def get_git_ignored_files(root_dir: str): - """Get the list of ignored files in a Git repository.""" - try: - output = subprocess.check_output(['git', 'ls-files', '--ignored', '--others', '--exclude-standard'], cwd=root_dir).strip().decode() - return output.split('\n') - except subprocess.CalledProcessError: - return [] - -def get_all_files(root_dir: str): - """Get a list of all files in a directory.""" - for dir_path, _, file_names in os.walk(root_dir): - for file_name in file_names: - yield os.path.join(os.path.relpath(dir_path, root_dir), file_name) - -def get_input_files(root_dir: str): - """Get a list of all files in a Git repository that are not ignored.""" - ignored_files = set(get_git_ignored_files(root_dir)) - all_files = set(get_all_files(root_dir)) - nonignored_files = all_files - ignored_files - return further_filter(nonignored_files, root_dir) - -def load_gpt_index_documents(root: str) -> List[Document]: - """Loads a list of GPTIndex Documents, respecting .gitignore files.""" - # Get input files - input_files = get_input_files(root) - # Use SimpleDirectoryReader to load the files into Documents - return SimpleDirectoryReader(root, input_files=input_files, file_metadata=lambda filename: {"filename": filename}).load_data() - -def index_dir_for(branch: str) -> str: - return f"data/{branch}" - -def get_git_root_dir(): - result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return result.stdout.decode().strip() - -def get_current_branch() -> str: - return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("utf-8").strip() - -def get_current_commit() -> str: - return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() - -def create_codebase_index(): - """Create a new index for the current branch.""" - branch = get_current_branch() - if not os.path.exists(index_dir_for(branch)): - os.makedirs(index_dir_for(branch)) - - documents = load_gpt_index_documents(get_git_root_dir()) - - chunks = {} - doc_chunks = [] - for doc in documents: - text_splitter = TokenTextSplitter() - text_chunks = text_splitter.split_text(doc.text) - filename = doc.extra_info["filename"] - chunks[filename] = len(text_chunks) - for i, text in enumerate(text_chunks): - doc_chunks.append(Document(text, doc_id=f"{filename}::{i}")) - - with open(f"{index_dir_for(branch)}/metadata.json", "w") as f: - json.dump({"commit": get_current_commit(), "chunks" : chunks}, f, indent=4) - - index = GPTSimpleVectorIndex([]) - for chunk in doc_chunks: - index.insert(chunk) - - # d = 1536 # Dimension of text-ada-embedding-002 - # faiss_index = faiss.IndexFlatL2(d) - # index = GPTFaissIndex(documents, faiss_index=faiss_index) - # index.save_to_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index") - - index.save_to_disk(f"{index_dir_for(branch)}/index.json") - - print("Codebase index created") - -def get_modified_deleted_files() -> Tuple[List[str], List[str]]: - """Get a list of all files that have been modified since the last commit.""" - branch = get_current_branch() - current_commit = get_current_commit() - - metadata = f"{index_dir_for(branch)}/metadata.json" - with open(metadata, "r") as f: - previous_commit = json.load(f)["commit"] - - modified_deleted_files = subprocess.check_output(["git", "diff", "--name-only", previous_commit, current_commit]).decode("utf-8").strip() - modified_deleted_files = modified_deleted_files.split("\n") - modified_deleted_files = [f for f in modified_deleted_files if f] - - root = get_git_root_dir() - deleted_files = [f for f in modified_deleted_files if not os.path.exists(root + "/" + f)] - modified_files = [f for f in modified_deleted_files if os.path.exists(root + "/" + f)] - - return further_filter(modified_files, index_dir_for(branch)), further_filter(deleted_files, index_dir_for(branch)) - -def update_codebase_index(): - """Update the index with a list of files.""" - branch = get_current_branch() - - if not os.path.exists(index_dir_for(branch)): - create_codebase_index() - else: - # index = GPTFaissIndex.load_from_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index") - index = GPTSimpleVectorIndex.load_from_disk(f"{index_dir_for(branch)}/index.json") - modified_files, deleted_files = get_modified_deleted_files() - - with open(f"{index_dir_for(branch)}/metadata.json", "r") as f: - metadata = json.load(f) - - for file in deleted_files: - - num_chunks = metadata["chunks"][file] - for i in range(num_chunks): - index.delete(f"{file}::{i}") - - del metadata["chunks"][file] - - print(f"Deleted {file}") - - for file in modified_files: - - if file in metadata["chunks"]: - - num_chunks = metadata["chunks"][file] - - for i in range(num_chunks): - index.delete(f"{file}::{i}") - - print(f"Deleted old version of {file}") - - with open(file, "r") as f: - text = f.read() - - text_splitter = TokenTextSplitter() - text_chunks = text_splitter.split_text(text) - - for i, text in enumerate(text_chunks): - index.insert(Document(text, doc_id=f"{file}::{i}")) - - metadata["chunks"][file] = len(text_chunks) - - print(f"Inserted new version of {file}") - - metadata["commit"] = get_current_commit() - - with open(f"{index_dir_for(branch)}/metadata.json", "w") as f: - json.dump(metadata, f, indent=4) - - print("Codebase index updated") - -if __name__ == "__main__": - """python3 update.py""" - update_codebase_index()
\ No newline at end of file |