Diffstat (limited to 'server/continuedev/plugins/recipes')
21 files changed, 1184 insertions, 0 deletions
diff --git a/server/continuedev/plugins/recipes/AddTransformRecipe/README.md b/server/continuedev/plugins/recipes/AddTransformRecipe/README.md
new file mode 100644
index 00000000..78d603a2
--- /dev/null
+++ b/server/continuedev/plugins/recipes/AddTransformRecipe/README.md
@@ -0,0 +1,9 @@
+# AddTransformRecipe
+
+Uses the Chess.com API example to show how to add map and filter Python transforms to a dlt pipeline. A minimal sketch of such a transform follows the background links below.
+
+Background
+
+- https://dlthub.com/docs/general-usage/resource#filter-transform-and-pivot-data
+- https://dlthub.com/docs/customizations/customizing-pipelines/renaming_columns
+- https://dlthub.com/docs/customizations/customizing-pipelines/pseudonymizing_columns
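
For orientation, here is a minimal, hedged sketch of the kind of transform this recipe produces. It assumes the `chess` verified source generated by `dlt init chess duckdb` exposes a `source()` function with a `players_games` resource, and that game records carry `white.result` and `pgn` fields; the real resource and field names should be checked against the generated `chess_pipeline.py`.

```python
import dlt

# Assumed import path for the verified source created by `dlt init chess duckdb`.
from chess import source


def drop_draws(game: dict) -> bool:
    # Filter transform: skip games that ended in a draw.
    # The result values here are illustrative; adjust to what the API actually returns.
    return game.get("white", {}).get("result") not in ("agreed", "repetition", "stalemate")


def truncate_pgn(game: dict) -> dict:
    # Map transform: shorten the PGN text so the loaded table stays small.
    if isinstance(game.get("pgn"), str):
        game["pgn"] = game["pgn"][:200]
    return game


if __name__ == "__main__":
    data = source(players=["magnuscarlsen"], start_month="2022/11", end_month="2022/12")

    # Attach the transforms to the resource before running the pipeline.
    data.resources["players_games"].add_filter(drop_draws).add_map(truncate_pgn)

    pipeline = dlt.pipeline(
        pipeline_name="chess_pipeline", destination="duckdb", dataset_name="chess_data"
    )
    print(pipeline.run(data))
```

The same `add_filter`/`add_map` pattern is documented in `dlt_transform_docs.md` below.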
diff --git a/server/continuedev/plugins/recipes/AddTransformRecipe/dlt_transform_docs.md b/server/continuedev/plugins/recipes/AddTransformRecipe/dlt_transform_docs.md new file mode 100644 index 00000000..864aea87 --- /dev/null +++ b/server/continuedev/plugins/recipes/AddTransformRecipe/dlt_transform_docs.md @@ -0,0 +1,142 @@ +# Customize resources + +## Filter, transform and pivot data + +You can attach any number of transformations that are evaluated on item per item basis to your resource. The available transformation types: + +- map - transform the data item (resource.add_map) +- filter - filter the data item (resource.add_filter) +- yield map - a map that returns iterator (so single row may generate many rows - resource.add_yield_map) + +Example: We have a resource that loads a list of users from an api endpoint. We want to customize it so: + +- we remove users with user_id == 'me' +- we anonymize user data + Here's our resource: + +```python +import dlt + +@dlt.resource(write_disposition='replace') +def users(): + ... + users = requests.get(...) + ... + yield users +``` + +Here's our script that defines transformations and loads the data. + +```python +from pipedrive import users + +def anonymize_user(user_data): + user_data['user_id'] = hash_str(user_data['user_id']) + user_data['user_email'] = hash_str(user_data['user_email']) + return user_data + +# add the filter and anonymize function to users resource and enumerate +for user in users().add_filter(lambda user: user['user_id'] != 'me').add_map(anonymize_user): +print(user) +``` + +Here is a more complex example of a filter transformation: + + # Renaming columns + ## Renaming columns by replacing the special characters + + In the example below, we create a dummy source with special characters in the name. We then write a function that we intend to apply to the resource to modify its output (i.e. replacing the German umlaut): replace_umlauts_in_dict_keys. + ```python + import dlt + + # create a dummy source with umlauts (special characters) in key names (um) + @dlt.source + def dummy_source(prefix: str = None): + @dlt.resource + def dummy_data(): + for _ in range(100): + yield {f'Objekt_{_}':{'Größe':_, 'Äquivalenzprüfung':True}} + return dummy_data(), + + def replace_umlauts_in_dict_keys(d): + # Replaces umlauts in dictionary keys with standard characters. + umlaut_map = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss', 'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue'} + result = {} + for k, v in d.items(): + new_key = ''.join(umlaut_map.get(c, c) for c in k) + if isinstance(v, dict): + result[new_key] = replace_umlauts_in_dict_keys(v) + else: + result[new_key] = v + return result + + # We can add the map function to the resource + + # 1. Create an instance of the source so you can edit it. + data_source = dummy_source() + + # 2. Modify this source instance's resource + data_source = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) + + # 3. Inspect your result + for row in data_source: + print(row) + + # {'Objekt_0': {'Groesse': 0, 'Aequivalenzpruefung': True}} + # ... + ``` + +Here is a more complex example of a map transformation: + +# Pseudonymizing columns + +## Pseudonymizing (or anonymizing) columns by replacing the special characters + +Pseudonymization is a deterministic way to hide personally identifiable info (PII), enabling us to consistently achieve the same mapping. If instead you wish to anonymize, you can delete the data, or replace it with a constant. 
In the example below, we create a dummy source with a PII column called 'name', which we replace with deterministic hashes (i.e. replacing the German umlaut). + +```python +import dlt +import hashlib + +@dlt.source +def dummy_source(prefix: str = None): + @dlt.resource + def dummy_data(): + for _ in range(3): + yield {'id':_, 'name': f'Jane Washington {_}'} + return dummy_data(), + +def pseudonymize_name(doc): + Pseudonmyisation is a deterministic type of PII-obscuring + Its role is to allow identifying users by their hash, without revealing the underlying info. + + # add a constant salt to generate + salt = 'WI@N57%zZrmk#88c' + salted_string = doc['name'] + salt + sh = hashlib.sha256() + sh.update(salted_string.encode()) + hashed_string = sh.digest().hex() + doc['name'] = hashed_string + return doc + + # run it as is + for row in dummy_source().dummy_data().add_map(pseudonymize_name): + print(row) + + #{'id': 0, 'name': '96259edb2b28b48bebce8278c550e99fbdc4a3fac8189e6b90f183ecff01c442'} + #{'id': 1, 'name': '92d3972b625cbd21f28782fb5c89552ce1aa09281892a2ab32aee8feeb3544a1'} + #{'id': 2, 'name': '443679926a7cff506a3b5d5d094dc7734861352b9e0791af5d39db5a7356d11a'} + + # Or create an instance of the data source, modify the resource and run the source. + + # 1. Create an instance of the source so you can edit it. + data_source = dummy_source() + # 2. Modify this source instance's resource + data_source = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) + # 3. Inspect your result + for row in data_source: + print(row) + + pipeline = dlt.pipeline(pipeline_name='example', destination='bigquery', dataset_name='normalized_data') + load_info = pipeline.run(data_source) +``` diff --git a/server/continuedev/plugins/recipes/AddTransformRecipe/main.py b/server/continuedev/plugins/recipes/AddTransformRecipe/main.py new file mode 100644 index 00000000..583cef1a --- /dev/null +++ b/server/continuedev/plugins/recipes/AddTransformRecipe/main.py @@ -0,0 +1,31 @@ +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....core.steps import MessageStep, WaitForUserInputStep +from .steps import AddTransformStep, SetUpChessPipelineStep + + +class AddTransformRecipe(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + text_observation = await sdk.run_step( + MessageStep( + message=dedent( + """\ + This recipe will walk you through the process of adding a transform to a dlt pipeline that uses the chess.com API source. With the help of Continue, you will: + - Set up a dlt pipeline for the chess.com API + - Add a filter or map transform to the pipeline + - Run the pipeline and view the transformed data in a Streamlit app""" + ), + name="Add transformation to a dlt pipeline", + ) + >> SetUpChessPipelineStep() + >> WaitForUserInputStep( + prompt="How do you want to transform the Chess.com API data before loading it? For example, you could filter out games that ended in a draw." 
+ ) + ) + await sdk.run_step( + AddTransformStep(transform_description=text_observation.text) + ) diff --git a/server/continuedev/plugins/recipes/AddTransformRecipe/steps.py b/server/continuedev/plugins/recipes/AddTransformRecipe/steps.py new file mode 100644 index 00000000..61638374 --- /dev/null +++ b/server/continuedev/plugins/recipes/AddTransformRecipe/steps.py @@ -0,0 +1,106 @@ +import os +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK, Models +from ....core.steps import MessageStep +from ....libs.util.paths import find_data_file + +AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)" + + +class SetUpChessPipelineStep(Step): + hide: bool = True + name: str = "Setup Chess.com API dlt Pipeline" + + async def describe(self, models: Models): + return "This step will create a new dlt pipeline that loads data from the chess.com API." + + async def run(self, sdk: ContinueSDK): + # running commands to get started when creating a new dlt pipeline + await sdk.run( + [ + "python3 -m venv .env", + "source .env/bin/activate", + "pip install dlt", + "dlt --non-interactive init chess duckdb", + "pip install -r requirements.txt", + "pip install pandas streamlit", # Needed for the pipeline show step later + ], + name="Set up Python environment", + description=dedent( + """\ + - Create a Python virtual environment: `python3 -m venv .env` + - Activate the virtual environment: `source .env/bin/activate` + - Install dlt: `pip install dlt` + - Create a new dlt pipeline called "chess" that loads data into a local DuckDB instance: `dlt init chess duckdb` + - Install the Python dependencies for the pipeline: `pip install -r requirements.txt`""" + ), + ) + + +class AddTransformStep(Step): + hide: bool = True + + # e.g. "Use the `python-chess` library to decode the moves in the game data" + transform_description: str + + async def run(self, sdk: ContinueSDK): + source_name = "chess" + filename = f"{source_name}_pipeline.py" + abs_filepath = os.path.join(sdk.ide.workspace_directory, filename) + + # Open the file and highlight the function to be edited + await sdk.ide.setFileOpen(abs_filepath) + + await sdk.run_step( + MessageStep( + message=dedent( + """\ + This step will customize your resource function with a transform of your choice: + - Add a filter or map transformation depending on your request + - Load the data into a local DuckDB instance + - Open up a Streamlit app for you to view the data""" + ), + name="Write transformation function", + ) + ) + + with open(find_data_file("dlt_transform_docs.md")) as f: + dlt_transform_docs = f.read() + + prompt = dedent( + f"""\ + Task: Write a transform function using the description below and then use `add_map` or `add_filter` from the `dlt` library to attach it a resource. + + Description: {self.transform_description} + + Here are some docs pages that will help you better understand how to use `dlt`. + + {dlt_transform_docs}""" + ) + + # edit the pipeline to add a transform function and attach it to a resource + await sdk.edit_file( + filename=filename, + prompt=prompt, + name=f"Writing transform function {AI_ASSISTED_STRING}", + ) + + await sdk.wait_for_user_confirmation( + "Press Continue to confirm that the changes are okay before we run the pipeline." 
+ ) + + # run the pipeline and load the data + await sdk.run( + f"python3 {filename}", + name="Run the pipeline", + description=f"Running `python3 {filename}` to load the data into a local DuckDB instance", + ) + + # run a streamlit app to show the data + await sdk.run( + f"dlt pipeline {source_name}_pipeline show", + name="Show data in a Streamlit app", + description=f"Running `dlt pipeline {source_name} show` to show the data in a Streamlit app, where you can view and play with the data.", + ) diff --git a/server/continuedev/plugins/recipes/ContinueRecipeRecipe/README.md b/server/continuedev/plugins/recipes/ContinueRecipeRecipe/README.md new file mode 100644 index 00000000..df66104f --- /dev/null +++ b/server/continuedev/plugins/recipes/ContinueRecipeRecipe/README.md @@ -0,0 +1,7 @@ +# ContinueRecipeRecipe + +A recipe for building recipes! + +## How to use this recipe + +This recipe takes a single input, a description of the recipe to be built. diff --git a/server/continuedev/plugins/recipes/ContinueRecipeRecipe/main.py b/server/continuedev/plugins/recipes/ContinueRecipeRecipe/main.py new file mode 100644 index 00000000..3dff2e15 --- /dev/null +++ b/server/continuedev/plugins/recipes/ContinueRecipeRecipe/main.py @@ -0,0 +1,43 @@ +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....plugins.steps.main import EditHighlightedCodeStep + + +class ContinueStepStep(Step): + name: str = "Write your own Continue Step." + prompt: str + + async def run(self, sdk: ContinueSDK): + await sdk.run_step( + EditHighlightedCodeStep( + user_input=dedent( + f"""\ + Here is an example of a Step that runs a command and then edits a file. + + ```python + from ...core.main import Step + from ...core.sdk import ContinueSDK + + class RunCommandAndEditFileStep(Step): + name: str = "Run a command and then edit a file." + command: str + file_path: str + prompt: str + + async def run(self, sdk: ContinueSDK): + await sdk.run([command]) + await sdk.edit_file(filename=self.file_path, prompt=self.prompt) + ``` + + Please edit the code to write your own Step that does the following: + + {self.prompt} + + It should be a subclass of Step as above, implementing the `run` method, and using pydantic attributes to define the parameters. + + """ + ) + ) + ) diff --git a/server/continuedev/plugins/recipes/CreatePipelineRecipe/README.md b/server/continuedev/plugins/recipes/CreatePipelineRecipe/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/server/continuedev/plugins/recipes/CreatePipelineRecipe/README.md diff --git a/server/continuedev/plugins/recipes/CreatePipelineRecipe/main.py b/server/continuedev/plugins/recipes/CreatePipelineRecipe/main.py new file mode 100644 index 00000000..56e6f055 --- /dev/null +++ b/server/continuedev/plugins/recipes/CreatePipelineRecipe/main.py @@ -0,0 +1,40 @@ +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....core.steps import MessageStep, WaitForUserInputStep +from .steps import RunQueryStep, SetupPipelineStep, ValidatePipelineStep + + +class CreatePipelineRecipe(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + text_observation = await sdk.run_step( + MessageStep( + name="Building your first dlt pipeline", + message=dedent( + """\ + This recipe will walk you through the process of creating a dlt pipeline for your chosen data source. 
With the help of Continue, you will: + - Create a Python virtual environment with dlt installed + - Run `dlt init` to generate a pipeline template + - Write the code to call the API + - Add any required API keys to the `secrets.toml` file + - Test that the API call works + - Load the data into a local DuckDB instance + - Write a query to view the data""" + ), + ) + >> WaitForUserInputStep( + prompt="What API do you want to load data from? (e.g. weatherapi.com, chess.com)" + ) + ) + await sdk.run_step( + SetupPipelineStep(api_description=text_observation.text) + >> ValidatePipelineStep() + >> RunQueryStep() + >> MessageStep( + name="Congrats!", + message="You've successfully created your first dlt pipeline! 🎉", + ) + ) diff --git a/server/continuedev/plugins/recipes/CreatePipelineRecipe/steps.py b/server/continuedev/plugins/recipes/CreatePipelineRecipe/steps.py new file mode 100644 index 00000000..65e7182d --- /dev/null +++ b/server/continuedev/plugins/recipes/CreatePipelineRecipe/steps.py @@ -0,0 +1,243 @@ +import os +import time +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK, Models +from ....core.steps import MessageStep +from ....models.filesystem import RangeInFile +from ....models.filesystem_edit import AddFile, FileEdit +from ....models.main import Range + +AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)" + + +class SetupPipelineStep(Step): + hide: bool = True + name: str = "Setup dlt Pipeline" + + api_description: str # e.g. "I want to load data from the weatherapi.com API" + + async def describe(self, models: Models): + return dedent( + f"""\ + This step will create a new dlt pipeline that loads data from an API, as per your request: + {self.api_description} + """ + ) + + async def run(self, sdk: ContinueSDK): + sdk.context.set("api_description", self.api_description) + + source_name = ( + await sdk.models.summarize.complete( + f"Write a snake_case name for the data source described by {self.api_description}: " + ) + ).strip() + filename = f"{source_name}.py" + + # running commands to get started when creating a new dlt pipeline + await sdk.run( + [ + "python3 -m venv .env", + "source .env/bin/activate", + "pip install dlt", + f"dlt --non-interactive init {source_name} duckdb", + "pip install -r requirements.txt", + ], + description=dedent( + f"""\ + Running the following commands: + - `python3 -m venv .env`: Create a Python virtual environment + - `source .env/bin/activate`: Activate the virtual environment + - `pip install dlt`: Install dlt + - `dlt init {source_name} duckdb`: Create a new dlt pipeline called {source_name} that loads data into a local DuckDB instance + - `pip install -r requirements.txt`: Install the Python dependencies for the pipeline""" + ), + name="Setup Python environment", + ) + + # editing the resource function to call the requested API + resource_function_range = Range.from_shorthand(15, 0, 30, 0) + await sdk.ide.highlightCode( + RangeInFile( + filepath=os.path.join(await sdk.ide.getWorkspaceDirectory(), filename), + range=resource_function_range, + ), + "#ffa50033", + ) + + # sdk.set_loading_message("Writing code to call the API...") + await sdk.edit_file( + range=resource_function_range, + filename=filename, + prompt=f"Edit the resource function to call the API described by this: {self.api_description}. 
Do not move or remove the exit() call in __main__.", + name=f"Edit the resource function to call the API {AI_ASSISTED_STRING}", + ) + + time.sleep(1) + + # wait for user to put API key in secrets.toml + await sdk.ide.setFileOpen( + await sdk.ide.getWorkspaceDirectory() + "/.dlt/secrets.toml" + ) + await sdk.wait_for_user_confirmation( + "If this service requires an API key, please add it to the `secrets.toml` file and then press `Continue`." + ) + + sdk.context.set("source_name", source_name) + + +class ValidatePipelineStep(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + workspace_dir = await sdk.ide.getWorkspaceDirectory() + source_name = sdk.context.get("source_name") + filename = f"{source_name}.py" + + # await sdk.run_step(MessageStep(name="Validate the pipeline", message=dedent("""\ + # Next, we will validate that your dlt pipeline is working as expected: + # - Test that the API call works + # - Load the data into a local DuckDB instance + # - Write a query to view the data + # """))) + + # test that the API call works + output = await sdk.run( + f"python3 {filename}", + name="Test the pipeline", + description=f"Running `python3 {filename}` to test loading data from the API", + handle_error=False, + ) + + # If it fails, return the error + if "Traceback" in output or "SyntaxError" in output: + output = "Traceback" + output.split("Traceback")[-1] + file_content = await sdk.ide.readFile(os.path.join(workspace_dir, filename)) + suggestion = await sdk.models.summarize.complete( + dedent( + f"""\ + ```python + {file_content} + ``` + This above code is a dlt pipeline that loads data from an API. The function with the @resource decorator is responsible for calling the API and returning the data. While attempting to run the pipeline, the following error occurred: + + ```ascii + {output} + ``` + + This is a brief summary of the error followed by a suggestion on how it can be fixed by editing the resource function:""" + ) + ) + + api_documentation_url = await sdk.models.summarize.complete( + dedent( + f"""\ + The API I am trying to call is the '{sdk.context.get('api_description')}'. I tried calling it in the @resource function like this: + ```python + {file_content} + ``` + What is the URL for the API documentation that will help me learn how to make this call? 
Please format in markdown so I can click the link.""" + ) + ) + + sdk.raise_exception( + title=f"Error while running pipeline.\nFix the resource function in {filename} and rerun this step", + message=output, + with_step=MessageStep( + name=f"Suggestion to solve error {AI_ASSISTED_STRING}", + message=dedent( + f"""\ + {suggestion} + + {api_documentation_url} + + After you've fixed the code, click the retry button at the top of the Validate Pipeline step above.""" + ), + ), + ) + + # remove exit() from the main main function + await sdk.run_step( + MessageStep( + name="Remove early exit() from main function", + message="Remove the early exit() from the main function now that we are done testing and want the pipeline to load the data into DuckDB.", + ) + ) + + contents = await sdk.ide.readFile(os.path.join(workspace_dir, filename)) + replacement = "\n".join( + list(filter(lambda line: line.strip() != "exit()", contents.split("\n"))) + ) + await sdk.ide.applyFileSystemEdit( + FileEdit( + filepath=os.path.join(workspace_dir, filename), + replacement=replacement, + range=Range.from_entire_file(contents), + ) + ) + + # load the data into the DuckDB instance + await sdk.run( + f"python3 {filename}", + name="Load data into DuckDB", + description=f"Running python3 {filename} to load data into DuckDB", + ) + + tables_query_code = dedent( + f"""\ + import duckdb + + # connect to DuckDB instance + conn = duckdb.connect(database="{source_name}.duckdb") + + # list all tables + print(conn.sql("DESCRIBE"))""" + ) + + query_filename = os.path.join(workspace_dir, "query.py") + await sdk.apply_filesystem_edit( + AddFile(filepath=query_filename, content=tables_query_code), + name="Add query.py file", + description="Adding a file called `query.py` to the workspace that will run a test query on the DuckDB instance", + ) + + +class RunQueryStep(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + output = await sdk.run( + ".env/bin/python3 query.py", + name="Run test query", + description="Running `.env/bin/python3 query.py` to test that the data was loaded into DuckDB as expected", + handle_error=False, + ) + + if "Traceback" in output or "SyntaxError" in output: + suggestion = await sdk.models.summarize.complete( + dedent( + f"""\ + ```python + {await sdk.ide.readFile(os.path.join(sdk.ide.workspace_directory, "query.py"))} + ``` + This above code is a query that runs on the DuckDB instance. While attempting to run the query, the following error occurred: + + ```ascii + {output} + ``` + + This is a brief summary of the error followed by a suggestion on how it can be fixed:""" + ) + ) + + sdk.raise_exception( + title="Error while running query", + message=output, + with_step=MessageStep( + name=f"Suggestion to solve error {AI_ASSISTED_STRING}", + message=suggestion + + "\n\nIt is also very likely that no duckdb table was created, which can happen if the resource function did not yield any data. 
Please make sure that it is yielding data and then rerun this step.", + ), + ) diff --git a/server/continuedev/plugins/recipes/DDtoBQRecipe/README.md b/server/continuedev/plugins/recipes/DDtoBQRecipe/README.md new file mode 100644 index 00000000..d50324f7 --- /dev/null +++ b/server/continuedev/plugins/recipes/DDtoBQRecipe/README.md @@ -0,0 +1,3 @@ +# DDtoBQRecipe + +Move from using DuckDB to Google BigQuery as the destination for your `dlt` pipeline diff --git a/server/continuedev/plugins/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md b/server/continuedev/plugins/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md new file mode 100644 index 00000000..eb68e117 --- /dev/null +++ b/server/continuedev/plugins/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md @@ -0,0 +1,85 @@ +### Credentials Missing: ConfigFieldMissingException + +You'll see this exception if `dlt` cannot find your bigquery credentials. In the exception below all of them ('project_id', 'private_key', 'client_email') are missing. The exception gives you also the list of all lookups for configuration performed - [here we explain how to read such list](run-a-pipeline.md#missing-secret-or-configuration-values). + +``` +dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['project_id', 'private_key', 'client_email'] in configuration with spec GcpServiceAccountCredentials + for field "project_id" config providers and keys were tried in following order: + In Environment Variables key WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID was not found. + In Environment Variables key WEATHERAPI__DESTINATION__CREDENTIALS__PROJECT_ID was not found. +``` + +The most common cases for the exception: + +1. The secrets are not in `secrets.toml` at all +2. The are placed in wrong section. For example the fragment below will not work: + +```toml +[destination.bigquery] +project_id = "project_id" # please set me up! +``` + +3. You run the pipeline script from the **different** folder from which it is saved. For example `python weatherapi_demo/weatherapi.py` will run the script from `weatherapi_demo` folder but the current working directory is folder above. This prevents `dlt` from finding `weatherapi_demo/.dlt/secrets.toml` and filling-in credentials. + +### Placeholders still in secrets.toml + +Here BigQuery complain that the format of the `private_key` is incorrect. Practically this most often happens if you forgot to replace the placeholders in `secrets.toml` with real values + +``` +<class 'dlt.destinations.exceptions.DestinationConnectionError'> +Connection with BigQuerySqlClient to dataset name weatherapi_data failed. Please check if you configured the credentials at all and provided the right credentials values. You can be also denied access or your internet connection may be down. The actual reason given is: No key could be detected. +``` + +### Bigquery not enabled + +[You must enable Bigquery API.](https://console.cloud.google.com/apis/dashboard) + +``` +<class 'google.api_core.exceptions.Forbidden'> +403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: BigQuery API has not been used in project 364286133232 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. 
+ +Location: EU +Job ID: a5f84253-3c10-428b-b2c8-1a09b22af9b2 + [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developers console API activation', 'url': 'https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232'}]}, {'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'SERVICE_DISABLED', 'domain': 'googleapis.com', 'metadata': {'service': 'bigquery.googleapis.com', 'consumer': 'projects/364286133232'}}] +``` + +### Lack of permissions to create jobs + +Add `BigQuery Job User` as described in the [destination page](../destinations/bigquery.md). + +``` +<class 'google.api_core.exceptions.Forbidden'> +403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: Access Denied: Project bq-walkthrough: User does not have bigquery.jobs.create permission in project bq-walkthrough. + +Location: EU +Job ID: c1476d2c-883c-43f7-a5fe-73db195e7bcd +``` + +### Lack of permissions to query/write data + +Add `BigQuery Data Editor` as described in the [destination page](../destinations/bigquery.md). + +``` +<class 'dlt.destinations.exceptions.DatabaseTransientException'> +403 Access Denied: Table bq-walkthrough:weatherapi_data._dlt_loads: User does not have permission to query table bq-walkthrough:weatherapi_data._dlt_loads, or perhaps it does not exist in location EU. + +Location: EU +Job ID: 299a92a3-7761-45dd-a433-79fdeb0c1a46 +``` + +### Lack of billing / BigQuery in sandbox mode + +`dlt` does not support BigQuery when project has no billing enabled. If you see a stack trace where following warning appears: + +``` +<class 'dlt.destinations.exceptions.DatabaseTransientException'> +403 Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. DML queries are not allowed in the free tier. Set up a billing account to remove this restriction. +``` + +or + +``` +2023-06-08 16:16:26,769|[WARNING ]|8096|dlt|load.py|complete_jobs:198|Job for weatherapi_resource_83b8ac9e98_4_jsonl retried in load 1686233775.932288 with message {"error_result":{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."},"errors":[{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."}],"job_start":"2023-06-08T14:16:26.850000Z","job_end":"2023-06-08T14:16:26.850000Z","job_id":"weatherapi_resource_83b8ac9e98_4_jsonl"} +``` + +you must enable the billing. 
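
For quick reference, here is a hedged sketch of the two configuration routes described above for getting BigQuery credentials to `dlt`: the `[destination.bigquery.credentials]` section of `.dlt/secrets.toml`, or environment variables. The variable names are taken from the `project_id` lookup list in the `ConfigFieldMissingException` above for a pipeline called `weatherapi`; the other fields are assumed to follow the same pattern, and a differently named pipeline prints (and reads) different keys.

```python
import os

import dlt

# Option A: .dlt/secrets.toml, using the section dlt actually reads
# (not a bare [destination.bigquery] table, as warned above):
#
#   [destination.bigquery.credentials]
#   project_id = "..."      # please set me up!
#   private_key = "..."     # please set me up!
#   client_email = "..."    # please set me up!
#
# Option B: environment variables, mirroring the lookup keys printed in the
# ConfigFieldMissingException for the weatherapi example (assumed pattern for
# the private_key and client_email fields).
os.environ["WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID"] = "<project_id>"
os.environ["WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__PRIVATE_KEY"] = "<private_key>"
os.environ["WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__CLIENT_EMAIL"] = "<client_email>"

# With credentials in place, the pipeline only needs destination="bigquery".
# Run the script from the folder it lives in so dlt can find .dlt/secrets.toml.
pipeline = dlt.pipeline(
    pipeline_name="weatherapi", destination="bigquery", dataset_name="weatherapi_data"
)
```

Either route supplies the `project_id`, `private_key`, and `client_email` fields that the exception above reports as missing.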
diff --git a/server/continuedev/plugins/recipes/DDtoBQRecipe/main.py b/server/continuedev/plugins/recipes/DDtoBQRecipe/main.py new file mode 100644 index 00000000..65149500 --- /dev/null +++ b/server/continuedev/plugins/recipes/DDtoBQRecipe/main.py @@ -0,0 +1,31 @@ +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....core.steps import MessageStep +from .steps import LoadDataStep, SetUpChessPipelineStep, SwitchDestinationStep + +# Based on the following guide: +# https://github.com/dlt-hub/dlt/pull/392 + + +class DDtoBQRecipe(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + await sdk.run_step( + MessageStep( + name="Move from using DuckDB to Google BigQuery as the destination", + message=dedent( + """\ + This recipe will walk you through the process of moving from using DuckDB to Google BigQuery as the destination for your dlt pipeline. With the help of Continue, you will: + - Set up a dlt pipeline for the chess.com API + - Switch destination from DuckDB to Google BigQuery + - Add BigQuery credentials to your secrets.toml file + - Run the pipeline again to load data to BigQuery""" + ), + ) + >> SetUpChessPipelineStep() + >> SwitchDestinationStep() + >> LoadDataStep() + ) diff --git a/server/continuedev/plugins/recipes/DDtoBQRecipe/steps.py b/server/continuedev/plugins/recipes/DDtoBQRecipe/steps.py new file mode 100644 index 00000000..dfe25d9e --- /dev/null +++ b/server/continuedev/plugins/recipes/DDtoBQRecipe/steps.py @@ -0,0 +1,119 @@ +import os +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK, Models +from ....core.steps import MessageStep +from ....libs.util.paths import find_data_file +from ....plugins.steps.find_and_replace import FindAndReplaceStep + +AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)" + + +class SetUpChessPipelineStep(Step): + hide: bool = True + name: str = "Setup Chess.com API dlt Pipeline" + + async def describe(self, models: Models): + return "This step will create a new dlt pipeline that loads data from the chess.com API." + + async def run(self, sdk: ContinueSDK): + # running commands to get started when creating a new dlt pipeline + await sdk.run( + [ + "python3 -m venv .env", + "source .env/bin/activate", + "pip install dlt", + "dlt --non-interactive init chess duckdb", + "pip install -r requirements.txt", + ], + name="Set up Python environment", + description=dedent( + """\ + Running the following commands: + - `python3 -m venv .env`: Create a Python virtual environment + - `source .env/bin/activate`: Activate the virtual environment + - `pip install dlt`: Install dlt + - `dlt init chess duckdb`: Create a new dlt pipeline called "chess" that loads data into a local DuckDB instance + - `pip install -r requirements.txt`: Install the Python dependencies for the pipeline""" + ), + ) + + +class SwitchDestinationStep(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + # Switch destination from DuckDB to Google BigQuery + filepath = os.path.join(sdk.ide.workspace_directory, "chess_pipeline.py") + await sdk.run_step( + FindAndReplaceStep( + filepath=filepath, + pattern="destination='duckdb'", + replacement="destination='bigquery'", + ) + ) + + # Add BigQuery credentials to your secrets.toml file + template = dedent( + """\ + [destination.bigquery.credentials] + location = "US" # change the location of the data + project_id = "project_id" # please set me up! + private_key = "private_key" # please set me up! 
+ client_email = "client_email" # please set me up!""" + ) + + # wait for user to put API key in secrets.toml + secrets_path = os.path.join(sdk.ide.workspace_directory, ".dlt/secrets.toml") + await sdk.ide.setFileOpen(secrets_path) + await sdk.append_to_file(secrets_path, template) + + # append template to bottom of secrets.toml + await sdk.wait_for_user_confirmation( + "Please add your GCP credentials to `secrets.toml` file and then press `Continue`" + ) + + +class LoadDataStep(Step): + name: str = "Load data to BigQuery" + hide: bool = True + + async def run(self, sdk: ContinueSDK): + # Run the pipeline again to load data to BigQuery + output = await sdk.run( + ".env/bin/python3 chess_pipeline.py", + name="Load data to BigQuery", + description="Running `.env/bin/python3 chess_pipeline.py` to load data to Google BigQuery", + ) + + if "Traceback" in output or "SyntaxError" in output: + with open(find_data_file("dlt_duckdb_to_bigquery_docs.md"), "r") as f: + docs = f.read() + + output = "Traceback" + output.split("Traceback")[-1] + suggestion = await sdk.models.default.complete( + dedent( + f"""\ + When trying to load data into BigQuery, the following error occurred: + + ```ascii + {output} + ``` + + Here is documentation describing common errors and their causes/solutions: + + {docs} + + This is a brief summary of the error followed by a suggestion on how it can be fixed:""" + ) + ) + + sdk.raise_exception( + title="Error while running query", + message=output, + with_step=MessageStep( + name=f"Suggestion to solve error {AI_ASSISTED_STRING}", + message=suggestion, + ), + ) diff --git a/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/README.md b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/README.md diff --git a/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/main.py b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/main.py new file mode 100644 index 00000000..5b0bd320 --- /dev/null +++ b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/main.py @@ -0,0 +1,86 @@ +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....core.steps import MessageStep +from ....plugins.steps.input.nl_multiselect import NLMultiselectStep +from .steps import DeployAirflowStep, RunPipelineStep, SetupPipelineStep + +# https://github.com/dlt-hub/dlt-deploy-template/blob/master/airflow-composer/dag_template.py +# https://www.notion.so/dlthub/Deploy-a-pipeline-with-Airflow-245fd1058652479494307ead0b5565f3 +# 1. What verified pipeline do you want to deploy with Airflow? +# 2. Set up selected verified pipeline +# 3. Deploy selected verified pipeline with Airflow +# 4. Set up Airflow locally? + + +class DeployPipelineAirflowRecipe(Step): + hide: bool = True + + async def run(self, sdk: ContinueSDK): + source_name = await sdk.run_step( + MessageStep( + name="Deploying a pipeline to Airflow", + message=dedent( + """\ + This recipe will show you how to deploy a pipeline to Airflow. With the help of Continue, you will: + - Select a dlt-verified pipeline + - Setup the pipeline + - Deploy it to Airflow + - Optionally, setup Airflow locally""" + ), + ) + >> NLMultiselectStep( + prompt=dedent( + """\ + Which verified pipeline do you want to deploy with Airflow? 
The options are: + - Asana + - Chess.com + - Facebook Ads + - GitHub + - Google Analytics + - Google Sheets + - HubSpot + - Jira + - Matomo + - Mux + - Notion + - Pipedrive + - Pokemon + - Salesforce + - Shopify + - Strapi + - Stripe + - SQL Database + - Workable + - Zendesk""" + ), + options=[ + "asana_dlt", + "chess", + "github", + "google_analytics", + "google_sheets", + "hubspot", + "matomo", + "pipedrive", + "shopify_dlt", + "strapi", + "zendesk", + "facebook_ads", + "jira", + "mux", + "notion", + "pokemon", + "salesforce", + "stripe_analytics", + "sql_database", + "workable", + ], + ) + ) + await sdk.run_step( + SetupPipelineStep(source_name=source_name) + >> RunPipelineStep(source_name=source_name) + >> DeployAirflowStep(source_name=source_name) + ) diff --git a/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/steps.py b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/steps.py new file mode 100644 index 00000000..e4a932af --- /dev/null +++ b/server/continuedev/plugins/recipes/DeployPipelineAirflowRecipe/steps.py @@ -0,0 +1,125 @@ +import os +from textwrap import dedent + +from ....core.main import Step +from ....core.sdk import ContinueSDK, Models +from ....core.steps import MessageStep +from ....plugins.steps.find_and_replace import FindAndReplaceStep + +AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)" + + +class SetupPipelineStep(Step): + hide: bool = True + name: str = "Setup dlt Pipeline" + + source_name: str + + async def describe(self, models: Models): + pass + + async def run(self, sdk: ContinueSDK): + await sdk.run( + [ + "python3 -m venv .env", + "source .env/bin/activate", + "pip install dlt", + f"dlt --non-interactive init {self.source_name} duckdb", + "pip install -r requirements.txt", + ], + description=dedent( + f"""\ + Running the following commands: + - `python3 -m venv .env`: Create a Python virtual environment + - `source .env/bin/activate`: Activate the virtual environment + - `pip install dlt`: Install dlt + - `dlt init {self.source_name} duckdb`: Create a new dlt pipeline called {self.source_name} that loads data into a local DuckDB instance + - `pip install -r requirements.txt`: Install the Python dependencies for the pipeline""" + ), + name="Setup Python environment", + ) + + +class RunPipelineStep(Step): + hide: bool = True + name: str = "Run dlt Pipeline" + + source_name: str + + async def describe(self, models: Models): + pass + + async def run(self, sdk: ContinueSDK): + await sdk.run( + [ + f"python3 {self.source_name}_pipeline.py", + ], + description=dedent( + f"""\ + Running the command `python3 {self.source_name}_pipeline.py to run the pipeline: """ + ), + name="Run dlt pipeline", + ) + + +class DeployAirflowStep(Step): + hide: bool = True + source_name: str + + async def run(self, sdk: ContinueSDK): + # Run dlt command to deploy pipeline to Airflow + await sdk.run( + [ + "git init", + f"dlt --non-interactive deploy {self.source_name}_pipeline.py airflow-composer", + ], + description="Running `dlt deploy airflow` to deploy the dlt pipeline to Airflow", + name="Deploy dlt pipeline to Airflow", + ) + + # Get filepaths, open the DAG file + directory = await sdk.ide.getWorkspaceDirectory() + pipeline_filepath = os.path.join(directory, f"{self.source_name}_pipeline.py") + dag_filepath = os.path.join( + directory, f"dags/dag_{self.source_name}_pipeline.py" + ) + + await sdk.ide.setFileOpen(dag_filepath) + + # Replace the pipeline name and dataset name + await sdk.run_step( + FindAndReplaceStep( + filepath=pipeline_filepath, + 
pattern="'pipeline_name'", + replacement=f"'{self.source_name}_pipeline'", + ) + ) + await sdk.run_step( + FindAndReplaceStep( + filepath=pipeline_filepath, + pattern="'dataset_name'", + replacement=f"'{self.source_name}_data'", + ) + ) + await sdk.run_step( + FindAndReplaceStep( + filepath=pipeline_filepath, + pattern="pipeline_or_source_script", + replacement=f"{self.source_name}_pipeline", + ) + ) + + # Prompt the user for the DAG schedule + # edit_dag_range = Range.from_shorthand(18, 0, 23, 0) + # await sdk.ide.highlightCode(range_in_file=RangeInFile(filepath=dag_filepath, range=edit_dag_range), color="#33993333") + # response = await sdk.run_step(WaitForUserInputStep(prompt="When would you like this Airflow DAG to run? (e.g. every day, every Monday, every 1st of the month, etc.)")) + # await sdk.edit_file(dag_filepath, prompt=f"Edit the DAG so that it runs at the following schedule: '{response.text}'", + # range=edit_dag_range) + + # Tell the user to check the schedule and fill in owner, email, other default_args + await sdk.run_step( + MessageStep( + message="Fill in the owner, email, and other default_args in the DAG file with your own personal information. Then the DAG will be ready to run!", + name="Fill in default_args", + ) + ) diff --git a/server/continuedev/plugins/recipes/README.md b/server/continuedev/plugins/recipes/README.md new file mode 100644 index 00000000..9860b0e2 --- /dev/null +++ b/server/continuedev/plugins/recipes/README.md @@ -0,0 +1,19 @@ +# This is a collaborative collection of Continue recipes + +A recipe is technically just a [Step](../steps/README.md), but is intended to be more complex, composed of multiple sub-steps. + +Recipes here will automatically be made available in the [Continue VS Code extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue). + +The `recipes` folder contains all recipes, each with the same structure. **If you wish to create your own recipe, please do the following:** + +1. Create a new subfolder in `recipes`, with the name of your recipe (for example `MyNewRecipe`). +2. Make 2 files in this folder: 1) a `README.md` describing your recipe and how to use it and 2) a `main.py` including a single class with the name of your recipe (e.g. `MyNewRecipe`). +3. Write any utility code other than the main recipe class in a separate file, which you can import in `main.py`. Particularly if you decide to break the recipe into multiple sub-steps, try to keep these separate. + +# Existing Recipes + +`ContinueRecipeRecipe` - Write a Continue recipe with Continue. + +`CreatePipelineRecipe` - Build a dlt pipeline from scratch for an API of your choice. + +`WritePytestsRecipe` - Write Pytest unit tests in a folder adjacent to your Python file. diff --git a/server/continuedev/plugins/recipes/TemplateRecipe/README.md b/server/continuedev/plugins/recipes/TemplateRecipe/README.md new file mode 100644 index 00000000..91d1123b --- /dev/null +++ b/server/continuedev/plugins/recipes/TemplateRecipe/README.md @@ -0,0 +1,7 @@ +# TemplateRecipe + +This folder is a template that you can copy to create your own recipe. + +## How to use this recipe + +Explain here what users should know when using your recipe. What inputs does it have and what actions will it perform? 
diff --git a/server/continuedev/plugins/recipes/TemplateRecipe/main.py b/server/continuedev/plugins/recipes/TemplateRecipe/main.py new file mode 100644 index 00000000..01ae364d --- /dev/null +++ b/server/continuedev/plugins/recipes/TemplateRecipe/main.py @@ -0,0 +1,29 @@ +from typing import Coroutine + +from ....core.main import Observation, Step +from ....core.sdk import ContinueSDK, Models + + +class TemplateRecipe(Step): + """ + A simple recipe that appends a print statement to the currently open file. + Use this as a template to create your own! + """ + + # Parameters for the recipe + name: str + + # A title for the recipe, to be displayed in the GUI + title = "Template Recipe" + + # A description of what the recipe accomplished, to be displayed in the GUI + async def describe(self, models: Models) -> Coroutine[str, None, None]: + return f"Appended a statement to print `Hello, {self.name}!` at the end of the file." + + # The code executed when the recipe is run + async def run(self, sdk: ContinueSDK) -> Coroutine[Observation, None, None]: + visible_files = await sdk.ide.getVisibleFiles() + await sdk.edit_file( + filename=visible_files[0], + prompt=f"Append a statement to print `Hello, {self.name}!` at the end of the file.", + ) diff --git a/server/continuedev/plugins/recipes/WritePytestsRecipe/README.md b/server/continuedev/plugins/recipes/WritePytestsRecipe/README.md new file mode 100644 index 00000000..5ce33ecb --- /dev/null +++ b/server/continuedev/plugins/recipes/WritePytestsRecipe/README.md @@ -0,0 +1,7 @@ +# CreatePytestsRecipe + +A recipe for writing unit tests in Pytest. + +# How to use this recipe + +Call this recipe with a python file open that you would like to test. It will create tests in a `tests/` folder adjacent to the file with the test file given the same name prepended by `test_`. diff --git a/server/continuedev/plugins/recipes/WritePytestsRecipe/main.py b/server/continuedev/plugins/recipes/WritePytestsRecipe/main.py new file mode 100644 index 00000000..63edabc6 --- /dev/null +++ b/server/continuedev/plugins/recipes/WritePytestsRecipe/main.py @@ -0,0 +1,52 @@ +import os +from textwrap import dedent +from typing import Union + +from ....core.main import Step +from ....core.sdk import ContinueSDK +from ....models.filesystem_edit import AddDirectory, AddFile + + +class WritePytestsRecipe(Step): + for_filepath: Union[str, None] = None + user_input: str = "Write unit tests for this file." + + async def describe(self, models): + return f"Writing unit tests for {self.for_filepath}" + + async def run(self, sdk: ContinueSDK): + if self.for_filepath is None: + self.for_filepath = (await sdk.ide.getVisibleFiles())[0] + + filename = os.path.basename(self.for_filepath) + dirname = os.path.dirname(self.for_filepath) + + path_dir = os.path.join(dirname, "tests") + if not os.path.exists(path_dir): + await sdk.apply_filesystem_edit(AddDirectory(path=path_dir)) + + path = os.path.join(path_dir, f"test_{filename}") + if os.path.exists(path): + return None + + for_file_contents = await sdk.ide.readFile(self.for_filepath) + + prompt = dedent( + f"""\ + This is the file you will write unit tests for: + + ```python + {for_file_contents} + ``` + + Here are additional instructions: + + "{self.user_input}" + + Here is a complete set of pytest unit tests:""" + ) + tests = await sdk.models.summarize.complete(prompt) + + await sdk.apply_filesystem_edit(AddFile(filepath=path, content=tests)) + + return None |