summaryrefslogtreecommitdiff
path: root/extension/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'extension/scripts')
-rw-r--r--extension/scripts/.gitignore5
-rw-r--r--extension/scripts/README.md5
-rw-r--r--extension/scripts/chroma.py152
-rw-r--r--extension/scripts/index.py52
-rw-r--r--extension/scripts/query.py63
-rw-r--r--extension/scripts/replace.py17
-rw-r--r--extension/scripts/requirements.txt6
-rw-r--r--extension/scripts/run_continue_server.py4
-rw-r--r--extension/scripts/update.py185
9 files changed, 0 insertions, 489 deletions
diff --git a/extension/scripts/.gitignore b/extension/scripts/.gitignore
deleted file mode 100644
index fbb3bf9f..00000000
--- a/extension/scripts/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-testdb
-env
-stdout.txt
-.continue_env_installed
-**.whl \ No newline at end of file
diff --git a/extension/scripts/README.md b/extension/scripts/README.md
deleted file mode 100644
index da1ad493..00000000
--- a/extension/scripts/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Scripts
-
-Whenever we need python to run on the client side, we include a script file at the top level of this folder. All other files that are not to be run directly as a script (utility files) should be in a subfolder of `scripts`. You can call one of these scripts from the VS Code extension using the `runPythonScript` function in `bridge.ts`.
-
-When the extension is activated (`activate` function in `src/extension.ts`), we call `setupPythonEnv`, which makes the virtual environment and downloads all the necessary requirements as given in `requirements.txt`. With this in mind, be sure to run `pip freeze > requirements.txt` whenever you add a new requirement.
diff --git a/extension/scripts/chroma.py b/extension/scripts/chroma.py
deleted file mode 100644
index 7425394e..00000000
--- a/extension/scripts/chroma.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import chromadb
-import os
-import json
-import subprocess
-
-from typing import List, Tuple
-
-from chromadb.config import Settings
-
# Module-level Chroma client shared by every helper in this file. It uses the
# DuckDB+Parquet backend and persists under the relative path "./data/", so
# the storage location depends on the process's current working directory.
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="./data/",
))
-
# File-name suffixes that are never indexed (compiled Python and image assets).
FILE_TYPES_TO_IGNORE = [
    '.pyc',
    '.png',
    '.jpg',
    '.jpeg',
    '.gif',
    '.svg',
    '.ico',
]
-
def further_filter(files: List[str], root_dir: str):
    """Yield the indexable entries of *files*, each prefixed with *root_dir*.

    An entry is skipped when it ends with one of ``FILE_TYPES_TO_IGNORE`` or
    starts with ``.git`` or ``archive``. Every surviving entry is yielded as
    ``root_dir + "/" + entry``.
    """
    # Build the suffix tuple once instead of once per file.
    ignored_suffixes = tuple(FILE_TYPES_TO_IGNORE)
    for file in files:
        if file.endswith(ignored_suffixes) or file.startswith(('.git', 'archive')):
            continue
        yield root_dir + "/" + file
-
def get_git_root_dir(path: str):
    """Return the top-level directory of the Git work tree containing *path*.

    Runs ``git rev-parse --show-toplevel`` with *path* as working directory.
    Returns ``None`` when git exits with an error, or when the process cannot
    be started at all (git missing, *path* not an existing directory).

    NOTE(review): a second ``get_git_root_dir`` is defined later in this file
    and rebinds the name, so this version is shadowed once the module loads.
    """
    try:
        output = subprocess.check_output(
            ['git', 'rev-parse', '--show-toplevel'], cwd=path
        )
    except (subprocess.CalledProcessError, OSError):
        return None
    return output.strip().decode()
-
def get_git_ignored_files(root_dir: str):
    """Return the paths git reports as ignored and untracked under *root_dir*.

    Runs ``git ls-files --ignored --others --exclude-standard`` in *root_dir*
    and returns one entry per output line. Returns ``[]`` when git fails, when
    the process cannot be started, or when nothing is ignored (the original
    returned ``['']`` in that last case).
    """
    try:
        output = subprocess.check_output(
            ['git', 'ls-files', '--ignored', '--others', '--exclude-standard'],
            cwd=root_dir,
        )
    except (subprocess.CalledProcessError, OSError):
        return []
    return output.strip().decode().splitlines()
-
def get_all_files(root_dir: str):
    """Yield every file under *root_dir* as a path relative to *root_dir*.

    The relative path is computed from the full file path, so files directly
    in *root_dir* come out as ``name`` rather than ``./name``. That is the
    form ``git ls-files`` prints, which lets callers compare the two sets.
    """
    for dir_path, _, file_names in os.walk(root_dir):
        for file_name in file_names:
            yield os.path.relpath(os.path.join(dir_path, file_name), root_dir)
-
def get_input_files(root_dir: str):
    """Return the files under *root_dir* that should be indexed.

    Takes every file found by ``get_all_files``, removes those reported by
    ``get_git_ignored_files``, and passes the rest through ``further_filter``
    (which also prefixes each path with *root_dir*).
    """
    ignored_files = set(get_git_ignored_files(root_dir))
    all_files = set(get_all_files(root_dir))
    # Sort so the indexing order is reproducible between runs; iterating the
    # raw set difference is not.
    nonignored_files = sorted(all_files - ignored_files)
    return further_filter(nonignored_files, root_dir)
-
def get_git_root_dir(cwd: str):
    """Return the top-level directory of the Git work tree containing *cwd*.

    Runs ``git rev-parse --show-toplevel`` in *cwd* and returns its stripped
    standard output. The exit status is not checked, so a failing git call
    yields an empty string; the same is returned when the process cannot be
    started (git missing, *cwd* not an existing directory).
    """
    try:
        result = subprocess.run(
            ['git', 'rev-parse', '--show-toplevel'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=cwd,
            check=False,
        )
    except OSError:
        return ""
    return result.stdout.decode().strip()
-
def get_current_branch(cwd: str) -> str:
    """Return the name of the branch checked out in *cwd*.

    Runs ``git rev-parse --abbrev-ref HEAD`` in *cwd*. Falls back to
    ``"main"`` when the command fails or cannot be started.
    """
    try:
        output = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=cwd
        )
    except Exception:
        # Best-effort by design, but do not trap KeyboardInterrupt/SystemExit
        # the way the former bare ``except`` did.
        return "main"
    return output.decode("utf-8").strip()
-
def get_current_commit(cwd: str) -> str:
    """Return the hash of ``HEAD`` for the repository containing *cwd*.

    Runs ``git rev-parse HEAD`` in *cwd*. Falls back to the sentinel string
    ``"NO_COMMITS"`` when the command fails or cannot be started.
    """
    try:
        output = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd)
    except Exception:
        # Best-effort by design, but do not trap KeyboardInterrupt/SystemExit
        # the way the former bare ``except`` did.
        return "NO_COMMITS"
    return output.decode("utf-8").strip()
-
def get_modified_deleted_files(cwd: str) -> Tuple[List[str], List[str]]:
    """Return the files changed between the last indexed commit and ``HEAD``.

    The last indexed commit is read from ``./data/<branch>.json``. The names
    printed by ``git diff --name-only <previous> <current>`` are split into
    those that still exist under the repository root (modified) and those
    that do not (deleted). Both lists are passed through ``further_filter``,
    which drops ignored file types and prefixes the repository root.

    Returns:
        ``(modified_files, deleted_files)`` as two lists, matching the
        annotation (the original returned two generators).

    Raises:
        OSError / KeyError: if the branch's JSON file is missing or malformed.
        subprocess.CalledProcessError: if ``git diff`` fails.
    """
    branch = get_current_branch(cwd)
    current_commit = get_current_commit(cwd)

    with open(f"./data/{branch}.json", 'r') as f:
        previous_commit = json.load(f)["commit"]

    diff_output = subprocess.check_output(
        ["git", "diff", "--name-only", previous_commit, current_commit], cwd=cwd
    ).decode("utf-8").strip()
    changed_files = [name for name in diff_output.split("\n") if name]

    root = get_git_root_dir(cwd)
    modified_files = []
    deleted_files = []
    for name in changed_files:
        # One existence check per file decides which list it belongs to.
        if os.path.exists(root + "/" + name):
            modified_files.append(name)
        else:
            deleted_files.append(name)

    return list(further_filter(modified_files, root)), list(further_filter(deleted_files, root))
-
def create_collection(branch: str, cwd: str):
    """Create the Chroma collection named *branch* and index the repository.

    If the collection cannot be created (for example because it already
    exists) the exception is printed and the function returns without
    indexing. Otherwise every file from ``get_input_files`` is added as one
    document whose id is its path, and the current commit hash is recorded in
    ``./data/<branch>.json``.

    Always returns ``None``; despite the original docstring, nothing reports
    whether the collection already existed.
    """
    try:
        collection = client.create_collection(name=branch)
    except Exception as e:
        print(e)
        return

    for file in get_input_files(get_git_root_dir(cwd)):
        try:
            with open(file, 'r') as f:
                contents = f.read()
        except (UnicodeDecodeError, OSError) as e:
            # A single binary or unreadable file must not abort the whole run.
            print(f"Skipped {file}: {e}")
            continue
        collection.add(documents=[contents], ids=[file])
        print(f"Added {file}")

    commit_file = f"./data/{branch}.json"
    # Branch names may contain "/" (e.g. "feature/x"); create the parent
    # directories so the write below does not fail.
    os.makedirs(os.path.dirname(commit_file), exist_ok=True)
    with open(commit_file, 'w') as f:
        json.dump({"commit": get_current_commit(cwd)}, f)
-
def collection_exists(cwd: str):
    """Return True if a Chroma collection exists for the current branch of *cwd*.

    ``client.list_collections()`` is compared by collection *name*. Testing
    the branch string for membership directly only works if the client
    returns plain names, so each entry's ``name`` attribute is used when
    present and the entry itself otherwise.
    """
    branch = get_current_branch(cwd)
    return any(
        getattr(entry, "name", entry) == branch
        for entry in client.list_collections()
    )
-
def update_collection(cwd: str):
    """Bring the current branch's collection up to date with ``HEAD``.

    Deletes the documents of files that no longer exist, re-reads and updates
    the documents of modified files, then records the new commit hash in
    ``./data/<branch>.json``. If any step fails, the error is printed and
    ``create_collection`` is called instead.

    NOTE(review): ``create_collection`` itself returns early when the
    collection already exists, so the fallback presumably only helps when the
    collection is missing. Confirm that is the intent.
    """
    branch = get_current_branch(cwd)

    try:
        collection = client.get_collection(branch)

        modified_files, deleted_files = get_modified_deleted_files(cwd)

        for file in deleted_files:
            collection.delete(ids=[file])
            print(f"Deleted {file}")

        for file in modified_files:
            with open(file, 'r') as f:
                collection.update(documents=[f.read()], ids=[file])
            print(f"Updated {file}")

        with open(f"./data/{branch}.json", 'w') as f:
            json.dump({"commit": get_current_commit(cwd)}, f)

    except Exception as e:
        # Was a bare ``except`` that hid the cause; report it the same way
        # create_collection reports its own failures.
        print(e)
        create_collection(branch, cwd)
-
def query_collection(query: str, n_results: int, cwd: str):
    """Query the current branch's collection and return Chroma's raw result.

    If the collection cannot be fetched, it is built with
    ``create_collection`` and fetched again; a failure of that second fetch
    propagates to the caller.

    Args:
        query: Text passed as the single entry of ``query_texts``.
        n_results: Number of results requested.
        cwd: Directory inside the repository to query.
    """
    branch = get_current_branch(cwd)
    try:
        collection = client.get_collection(branch)
    except Exception:
        # Narrowed from a bare ``except`` so interrupts are not swallowed.
        create_collection(branch, cwd)
        collection = client.get_collection(branch)
    return collection.query(query_texts=[query], n_results=n_results)
diff --git a/extension/scripts/index.py b/extension/scripts/index.py
deleted file mode 100644
index 3afc9131..00000000
--- a/extension/scripts/index.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import sys
-import os
-from typing import TextIO
-from chroma import update_collection, query_collection, create_collection, collection_exists, get_current_branch
-from typer import Typer
-
# Typer application; the functions decorated with @app.command are its subcommands.
app = Typer()
-
class SilenceStdoutContextManager:
    """Context manager that discards everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is replaced by a handle on ``os.devnull``; on exit
    that handle is closed and the previous stream is restored.
    """

    # Declared by the original class but never assigned; kept so the class's
    # annotations stay backward-compatible.
    saved_stdout: TextIO
    # Stream that was installed as sys.stdout when the block was entered.
    _original_stdout: TextIO
    # The os.devnull handle opened by __enter__.
    _devnull: TextIO

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close the handle we opened, not whatever sys.stdout is now: code
        # inside the block may have swapped sys.stdout for something else.
        self._devnull.close()
        sys.stdout = self._original_stdout
-
# Single shared instance used by every command below as ``with silence:``.
silence = SilenceStdoutContextManager()
-
@app.command("exists")
def exists(cwd: str):
    """Print ``{"exists": <bool>}`` for the current branch's collection.

    The lookup runs with stdout silenced so that the final ``print`` is the
    only output. The value printed is a Python dict repr, not JSON.
    """
    with silence:
        found = collection_exists(cwd)
    print({"exists": found})
-
@app.command("create")
def create(cwd: str):
    """Create and populate the collection for the current branch of *cwd*.

    The work runs with stdout silenced; afterwards ``{"success": True}`` is
    printed unconditionally (as a Python dict repr, not JSON).
    """
    with silence:
        branch = get_current_branch(cwd)
        create_collection(branch, cwd)
    print({"success": True})
-
@app.command("update")
def update(cwd: str):
    """Update the collection for the current branch of *cwd*.

    The work runs with stdout silenced; afterwards ``{"success": True}`` is
    printed unconditionally (as a Python dict repr, not JSON).
    """
    with silence:
        update_collection(cwd)
    print({"success": True})
-
@app.command("query")
def query(query: str, n_results: int, cwd: str):
    """Query the current branch's collection and print the matches.

    Prints ``{"results": [{"id": ..., "document": ...}, ...]}`` built from the
    first entry of the response's ``ids`` and ``documents`` lists (only one
    query text is sent). The value printed is a Python dict repr, not JSON.
    """
    with silence:
        resp = query_collection(query, n_results, cwd)
    results = [
        {"id": doc_id, "document": document}
        for doc_id, document in zip(resp["ids"][0], resp["documents"][0])
    ]
    print({"results": results})
-
# Run the Typer CLI when this file is executed as a script.
if __name__ == "__main__":
    app()
diff --git a/extension/scripts/query.py b/extension/scripts/query.py
deleted file mode 100644
index f2e44413..00000000
--- a/extension/scripts/query.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import subprocess
-import sys
-from gpt_index import GPTSimpleVectorIndex, GPTFaissIndex
-import os
-from typer import Typer
-from enum import Enum
-from update import update_codebase_index, create_codebase_index, index_dir_for, get_current_branch
-from replace import replace_additional_index
-
# Typer application; the functions decorated with @app.command are its subcommands.
app = Typer()
-
def query_codebase_index(query: str) -> str:
    """Query the saved codebase index of the current Git branch.

    The branch is read with ``git rev-parse --abbrev-ref HEAD`` in the current
    working directory and the index is loaded from
    ``data/<branch>/index.json``. When that file does not exist a message is
    printed and an empty string is returned.
    """
    branch = subprocess.check_output(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"]
    ).decode("utf-8").strip()
    # The original lacked the f-prefix, so it looked for the literal path
    # "data/{branch}/index.json" and never found an index.
    path = f'data/{branch}/index.json'
    if not os.path.exists(path):
        print("No index found for the codebase")
        return ""
    # update.py saves and reloads this file with GPTSimpleVectorIndex (its
    # FAISS variant is commented out), so load it with the same class.
    index = GPTSimpleVectorIndex.load_from_disk(path)
    return index.query(query)
-
def query_additional_index(query: str) -> str:
    """Query the additional index stored at ``data/additional_index.json``.

    Loads the file with ``GPTSimpleVectorIndex.load_from_disk`` and returns
    the result of ``index.query(query)``.
    """
    index = GPTSimpleVectorIndex.load_from_disk('data/additional_index.json')
    return index.query(query)
-
class IndexTypeOption(str, Enum):
    """Which index a query targets; the values are the accepted CLI strings."""

    codebase = "codebase"
    additional = "additional"
-
@app.command()
def query(context: IndexTypeOption, query: str):
    """Query the index selected by *context* and print ``{"response": ...}``.

    For an unrecognised *context* only the error message is printed. The
    original fell through to the final ``print`` with ``response`` unbound
    and raised ``NameError``.
    """
    if context == IndexTypeOption.additional:
        response = query_additional_index(query)
    elif context == IndexTypeOption.codebase:
        response = query_codebase_index(query)
    else:
        print("Error: unknown context")
        return
    print({ "response": response })
-
@app.command()
def check_index_exists(root_path: str):
    """Print ``{"exists": <bool>}`` for the current branch's index directory.

    *root_path* is accepted as a CLI argument but not used: the branch comes
    from ``get_current_branch()`` and the directory from ``index_dir_for``.
    """
    branch = get_current_branch()
    index_present = os.path.exists(index_dir_for(branch))
    print({ "exists": index_present })
-
@app.command()
def update():
    """Run ``update_codebase_index()`` and print a confirmation line."""
    update_codebase_index()
    print("Updated codebase index")
-
@app.command()
def create_index(path: str):
    """Run ``create_codebase_index()`` and print a confirmation line.

    *path* is accepted as a CLI argument but not used by this command.
    """
    create_codebase_index()
    print("Created file index")
-
@app.command()
def replace_additional_index(info: str):
    """Rebuild the additional index from *info* and print a confirmation line.

    This command has the same name as the helper imported from ``replace`` at
    the top of the file, so defining it rebinds that name. The original body
    therefore called itself, and without an argument. The helper is
    re-imported here under an alias and given *info*.
    """
    from replace import replace_additional_index as rebuild_additional_index

    rebuild_additional_index(info)
    print("Replaced additional index")
-
# Run the Typer CLI when this file is executed as a script.
if __name__ == '__main__':
    app()
diff --git a/extension/scripts/replace.py b/extension/scripts/replace.py
deleted file mode 100644
index 08810243..00000000
--- a/extension/scripts/replace.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import sys
-from gpt_index import GPTSimpleVectorIndex, Document
-
def replace_additional_index(info: str):
    """Replace the additional context and rebuild its index from *info*.

    Writes *info* to ``data/additional_context.txt``, builds a
    ``GPTSimpleVectorIndex`` over a single ``Document`` holding *info*, and
    saves it to ``data/additional_index.json``.
    """
    import os

    # Both outputs live in data/; make sure it exists before writing.
    os.makedirs('data', exist_ok=True)
    with open('data/additional_context.txt', 'w') as f:
        f.write(info)
    documents = [Document(info)]
    index = GPTSimpleVectorIndex(documents)
    index.save_to_disk('data/additional_index.json')
    print("Additional index replaced")
-
# Usage: python3 replace.py <info>
# Does nothing when <info> is missing or empty.
if __name__ == "__main__":
    info = sys.argv[1] if len(sys.argv) > 1 else None
    if info:
        replace_additional_index(info)
diff --git a/extension/scripts/requirements.txt b/extension/scripts/requirements.txt
deleted file mode 100644
index c51c9d73..00000000
--- a/extension/scripts/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# chromadb==0.3.10
-# pathspec==0.11.0
-# typer==0.7.0
-# pydantic
-# pytest
-./continuedev-0.1.2-py3-none-any.whl \ No newline at end of file
diff --git a/extension/scripts/run_continue_server.py b/extension/scripts/run_continue_server.py
deleted file mode 100644
index 089cc54d..00000000
--- a/extension/scripts/run_continue_server.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from continuedev.server.main import run_server
-
# Start the server from the continuedev package when run as a script.
if __name__ == "__main__":
    run_server()
diff --git a/extension/scripts/update.py b/extension/scripts/update.py
deleted file mode 100644
index 15ad6ac0..00000000
--- a/extension/scripts/update.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# import faiss
-import json
-import os
-import subprocess
-
-from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
-from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, Document, GPTFaissIndex
-from typing import List, Generator, Tuple
-
# File-name suffixes that are never indexed (compiled Python and image assets).
FILE_TYPES_TO_IGNORE = [
    '.pyc',
    '.png',
    '.jpg',
    '.jpeg',
    '.gif',
    '.svg',
    '.ico',
]
-
def further_filter(files: List[str], root_dir: str):
    """Yield the indexable entries of *files*, each prefixed with *root_dir*.

    An entry is skipped when it ends with one of ``FILE_TYPES_TO_IGNORE`` or
    starts with ``.git`` or ``archive``. Every surviving entry is yielded as
    ``root_dir + "/" + entry``.
    """
    # Build the suffix tuple once instead of once per file.
    ignored_suffixes = tuple(FILE_TYPES_TO_IGNORE)
    for file in files:
        if file.endswith(ignored_suffixes) or file.startswith(('.git', 'archive')):
            continue
        yield root_dir + "/" + file
-
def get_git_root_dir(path: str):
    """Return the top-level directory of the Git work tree containing *path*.

    Runs ``git rev-parse --show-toplevel`` with *path* as working directory.
    Returns ``None`` when git exits with an error, or when the process cannot
    be started at all (git missing, *path* not an existing directory).

    NOTE(review): a parameterless ``get_git_root_dir`` is defined later in
    this file and rebinds the name, so this version is shadowed once the
    module loads.
    """
    try:
        output = subprocess.check_output(
            ['git', 'rev-parse', '--show-toplevel'], cwd=path
        )
    except (subprocess.CalledProcessError, OSError):
        return None
    return output.strip().decode()
-
def get_git_ignored_files(root_dir: str):
    """Return the paths git reports as ignored and untracked under *root_dir*.

    Runs ``git ls-files --ignored --others --exclude-standard`` in *root_dir*
    and returns one entry per output line. Returns ``[]`` when git fails, when
    the process cannot be started, or when nothing is ignored (the original
    returned ``['']`` in that last case).
    """
    try:
        output = subprocess.check_output(
            ['git', 'ls-files', '--ignored', '--others', '--exclude-standard'],
            cwd=root_dir,
        )
    except (subprocess.CalledProcessError, OSError):
        return []
    return output.strip().decode().splitlines()
-
def get_all_files(root_dir: str):
    """Yield every file under *root_dir* as a path relative to *root_dir*.

    The relative path is computed from the full file path, so files directly
    in *root_dir* come out as ``name`` rather than ``./name``. That is the
    form ``git ls-files`` prints, which lets callers compare the two sets.
    """
    for dir_path, _, file_names in os.walk(root_dir):
        for file_name in file_names:
            yield os.path.relpath(os.path.join(dir_path, file_name), root_dir)
-
def get_input_files(root_dir: str):
    """Return the files under *root_dir* that should be indexed.

    Takes every file found by ``get_all_files``, removes those reported by
    ``get_git_ignored_files``, and passes the rest through ``further_filter``
    (which also prefixes each path with *root_dir*).
    """
    ignored_files = set(get_git_ignored_files(root_dir))
    all_files = set(get_all_files(root_dir))
    # Sort so the indexing order is reproducible between runs; iterating the
    # raw set difference is not.
    nonignored_files = sorted(all_files - ignored_files)
    return further_filter(nonignored_files, root_dir)
-
def load_gpt_index_documents(root: str) -> List[Document]:
    """Load the non-ignored files under *root* as GPT Index documents.

    The file list comes from ``get_input_files``. Each document's
    ``extra_info`` is ``{"filename": <name>}``, where the name is whatever
    ``SimpleDirectoryReader`` passes to the ``file_metadata`` callback.
    """
    # get_input_files returns a one-shot generator; hand the reader a list so
    # it can be inspected or iterated more than once.
    input_files = list(get_input_files(root))
    reader = SimpleDirectoryReader(
        root,
        input_files=input_files,
        file_metadata=lambda filename: {"filename": filename},
    )
    return reader.load_data()
-
def index_dir_for(branch: str) -> str:
    """Return the relative directory that holds the index files for *branch*.

    The result is ``data/<branch>``; a branch name containing ``/`` therefore
    maps to a nested directory.
    """
    return f"data/{branch}"
-
def get_git_root_dir():
    """Return the top-level directory of the Git work tree of the current directory.

    Runs ``git rev-parse --show-toplevel`` and returns its stripped standard
    output. The exit status is not checked, so a failing git call yields an
    empty string; the same is returned when git cannot be started.
    """
    try:
        result = subprocess.run(
            ['git', 'rev-parse', '--show-toplevel'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except OSError:
        return ""
    return result.stdout.decode().strip()
-
def get_current_branch() -> str:
    """Return the name of the branch checked out in the current directory.

    Runs ``git rev-parse --abbrev-ref HEAD``.

    Raises:
        subprocess.CalledProcessError: if git exits with an error; unlike the
            chroma.py helper of the same name there is no fallback value.
    """
    output = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    return output.decode("utf-8").strip()
-
def get_current_commit() -> str:
    """Return the hash of ``HEAD`` for the repository of the current directory.

    Runs ``git rev-parse HEAD``.

    Raises:
        subprocess.CalledProcessError: if git exits with an error.
    """
    output = subprocess.check_output(["git", "rev-parse", "HEAD"])
    return output.decode("utf-8").strip()
-
def create_codebase_index():
    """Create a new index for the current branch.

    Loads the repository's documents, splits each one into token chunks,
    inserts every chunk into a ``GPTSimpleVectorIndex`` under the id
    ``<filename>::<chunk number>``, and writes two files into
    ``index_dir_for(branch)``:

    * ``index.json`` holds the saved index;
    * ``metadata.json`` holds the current commit hash and, per filename, the
      number of chunks it produced.
    """
    branch = get_current_branch()
    index_dir = index_dir_for(branch)
    os.makedirs(index_dir, exist_ok=True)

    documents = load_gpt_index_documents(get_git_root_dir())

    text_splitter = TokenTextSplitter()
    chunks = {}
    doc_chunks = []
    for doc in documents:
        text_chunks = text_splitter.split_text(doc.text)
        filename = doc.extra_info["filename"]
        chunks[filename] = len(text_chunks)
        for i, text in enumerate(text_chunks):
            doc_chunks.append(Document(text, doc_id=f"{filename}::{i}"))

    index = GPTSimpleVectorIndex([])
    for chunk in doc_chunks:
        index.insert(chunk)

    # d = 1536 # Dimension of text-ada-embedding-002
    # faiss_index = faiss.IndexFlatL2(d)
    # index = GPTFaissIndex(documents, faiss_index=faiss_index)
    # index.save_to_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")

    index.save_to_disk(f"{index_dir}/index.json")

    # Write the metadata only after the index is safely on disk. The original
    # wrote it first, so a failure while building the index left metadata that
    # described an index which did not exist.
    with open(f"{index_dir}/metadata.json", "w") as f:
        json.dump({"commit": get_current_commit(), "chunks" : chunks}, f, indent=4)

    print("Codebase index created")
-
def get_modified_deleted_files() -> Tuple[List[str], List[str]]:
    """Return the files changed between the last indexed commit and ``HEAD``.

    The last indexed commit is read from the branch's ``metadata.json``. The
    names printed by ``git diff --name-only <previous> <current>`` are split
    into those that still exist under the repository root (modified) and
    those that do not (deleted). Both lists go through ``further_filter``.

    Returns:
        ``(modified_files, deleted_files)`` as two lists of paths prefixed
        with the repository root. The original prefixed them with the index
        directory (``data/<branch>``), so the paths pointed at files that do
        not exist.

    NOTE(review): whether these paths match the keys of ``metadata["chunks"]``
    depends on what ``SimpleDirectoryReader`` reports as the filename; confirm.
    """
    branch = get_current_branch()
    current_commit = get_current_commit()

    metadata = f"{index_dir_for(branch)}/metadata.json"
    with open(metadata, "r") as f:
        previous_commit = json.load(f)["commit"]

    diff_output = subprocess.check_output(
        ["git", "diff", "--name-only", previous_commit, current_commit]
    ).decode("utf-8").strip()
    changed_files = [name for name in diff_output.split("\n") if name]

    root = get_git_root_dir()
    modified_files = []
    deleted_files = []
    for name in changed_files:
        # One existence check per file decides which list it belongs to.
        if os.path.exists(root + "/" + name):
            modified_files.append(name)
        else:
            deleted_files.append(name)

    return list(further_filter(modified_files, root)), list(further_filter(deleted_files, root))
-
def update_codebase_index():
    """Update the current branch's index to reflect changes since it was built.

    When the branch has no index directory yet, a fresh index is created
    instead. Otherwise the saved index and metadata are loaded, chunks of
    deleted files are removed, chunks of modified files are replaced, and both
    ``index.json`` and ``metadata.json`` are written back.
    """
    branch = get_current_branch()
    index_dir = index_dir_for(branch)

    if not os.path.exists(index_dir):
        create_codebase_index()
        return

    index_path = f"{index_dir}/index.json"
    metadata_path = f"{index_dir}/metadata.json"

    # index = GPTFaissIndex.load_from_disk(f"{index_dir_for(branch)}/index.json", faiss_index_save_path=f"{index_dir_for(branch)}/index_faiss_core.index")
    index = GPTSimpleVectorIndex.load_from_disk(index_path)
    modified_files, deleted_files = get_modified_deleted_files()

    with open(metadata_path, "r") as f:
        metadata = json.load(f)

    for file in deleted_files:
        # A deleted file that was never indexed has nothing to remove; the
        # original raised KeyError here.
        if file not in metadata["chunks"]:
            continue

        num_chunks = metadata["chunks"][file]
        for i in range(num_chunks):
            index.delete(f"{file}::{i}")

        del metadata["chunks"][file]

        print(f"Deleted {file}")

    text_splitter = TokenTextSplitter()
    for file in modified_files:

        if file in metadata["chunks"]:

            num_chunks = metadata["chunks"][file]

            for i in range(num_chunks):
                index.delete(f"{file}::{i}")

            print(f"Deleted old version of {file}")

        with open(file, "r") as f:
            text = f.read()

        text_chunks = text_splitter.split_text(text)

        for i, chunk in enumerate(text_chunks):
            index.insert(Document(chunk, doc_id=f"{file}::{i}"))

        metadata["chunks"][file] = len(text_chunks)

        print(f"Inserted new version of {file}")

    metadata["commit"] = get_current_commit()

    # Persist the modified index. The original only rewrote metadata.json, so
    # every delete/insert above was lost when the process ended.
    index.save_to_disk(index_path)

    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=4)

    print("Codebase index updated")
-
# Usage: python3 update.py
if __name__ == "__main__":
    update_codebase_index()