From d4c1b33e4d5311286fc20cfc5b28f2e3571869b2 Mon Sep 17 00:00:00 2001
From: Tyler Dunn
Date: Sun, 11 Jun 2023 18:18:33 -0700
Subject: initial approach

---
 .../src/continuedev/recipes/DDtoBQRecipe/README.md |   3 +
 .../DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md    | 108 +++++++++++++++++++++
 .../src/continuedev/recipes/DDtoBQRecipe/main.py   |  27 ++++++
 .../src/continuedev/recipes/DDtoBQRecipe/steps.py  |  90 +++++++++++++++++
 4 files changed, 228 insertions(+)
 create mode 100644 continuedev/src/continuedev/recipes/DDtoBQRecipe/README.md
 create mode 100644 continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
 create mode 100644 continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
 create mode 100644 continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py

(limited to 'continuedev/src')

diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/README.md b/continuedev/src/continuedev/recipes/DDtoBQRecipe/README.md
new file mode 100644
index 00000000..c4981e56
--- /dev/null
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/README.md
@@ -0,0 +1,3 @@
+# DDtoBQRecipe
+
+Move from using DuckDB to Google BigQuery as the destination for your `dlt` pipeline
\ No newline at end of file
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md b/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
new file mode 100644
index 00000000..fce9a083
--- /dev/null
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
@@ -0,0 +1,108 @@
+---
+title: "Share a dataset: duckdb -> BigQuery"
+description: Share a local dataset by moving it to BigQuery
+keywords: [how to, share a dataset]
+---
+
+# Share a dataset: duckdb -> BigQuery
+In previous walkthroughs you used the local stack to create and run your pipeline. This saved you the headache of setting up a cloud account and credentials, and often saved you money as well. Our choice for a local "warehouse" is `duckdb`: fast, feature-rich, and working everywhere. However, at some point you will want to move to production or share the results with your colleagues. The local `duckdb` file is not sufficient for that! Let's move the dataset to BigQuery now.
+
+## 1. Replace the "destination" argument with "bigquery"
+```python
+if __name__=='__main__':
+
+    # below we replaced "duckdb" in the "destination" argument with "bigquery"
+    pipeline = dlt.pipeline(pipeline_name='weatherapi', destination='bigquery', dataset_name='weatherapi_data')
+```
+And that's it for the code modifications! If you run the script, `dlt` will create in BigQuery a dataset identical to the one you had in `duckdb`.
+
+## 2. Enable access to BigQuery and obtain credentials.
+Please [follow these steps](../destinations/bigquery.md) to enable `dlt` to write data to BigQuery.
+
+## 3. Add credentials to secrets.toml
+Please add the following section to your `secrets.toml` file, using the credentials obtained in the previous step:
+```toml
+[destination.bigquery.credentials]
+location = "US" # change the location of the data
+project_id = "project_id" # please set me up!
+private_key = "private_key" # please set me up!
+client_email = "client_email" # please set me up!
+```
+
+## 4. Run the pipeline again
+```
+python weatherapi.py
+```
+Head on to the next section if you see exceptions!
+
+## 5. Troubleshoot exceptions
+
+### Credentials Missing: ConfigFieldMissingException
+
+You'll see this exception if `dlt` cannot find your BigQuery credentials. In the exception below all of them ('project_id', 'private_key', 'client_email') are missing. The exception also gives you the list of all configuration lookups performed - [here we explain how to read such a list](run-a-pipeline.md#missing-secret-or-configuration-values).
+```
+dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['project_id', 'private_key', 'client_email'] in configuration with spec GcpServiceAccountCredentials
+    for field "project_id" config providers and keys were tried in following order:
+        In Environment Variables key WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID was not found.
+        In Environment Variables key WEATHERAPI__DESTINATION__CREDENTIALS__PROJECT_ID was not found.
+```
+The most common cases for the exception:
+1. The secrets are not in `secrets.toml` at all
+2. They are placed in the wrong section. For example, the fragment below will not work:
+```toml
+[destination.bigquery]
+project_id = "project_id" # please set me up!
+```
+3. You run the pipeline script from a **different** folder than the one it is saved in. For example, `python weatherapi_demo/weatherapi.py` will run the script from the `weatherapi_demo` folder while the current working directory is the folder above. This prevents `dlt` from finding `weatherapi_demo/.dlt/secrets.toml` and filling in credentials.
+
+### Placeholders still in secrets.toml
+Here BigQuery complains that the format of the `private_key` is incorrect. In practice this most often happens when you forget to replace the placeholders in `secrets.toml` with real values.
+
+```
+
+Connection with BigQuerySqlClient to dataset name weatherapi_data failed. Please check if you configured the credentials at all and provided the right credentials values. You can be also denied access or your internet connection may be down. The actual reason given is: No key could be detected.
+```
+
+### BigQuery not enabled
+[You must enable the BigQuery API.](https://console.cloud.google.com/apis/dashboard)
+```
+
+403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: BigQuery API has not been used in project 364286133232 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.
+
+Location: EU
+Job ID: a5f84253-3c10-428b-b2c8-1a09b22af9b2
+ [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developers console API activation', 'url': 'https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232'}]}, {'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'SERVICE_DISABLED', 'domain': 'googleapis.com', 'metadata': {'service': 'bigquery.googleapis.com', 'consumer': 'projects/364286133232'}}]
+ ```
+
+### Lack of permissions to create jobs
+Add `BigQuery Job User` as described in the [destination page](../destinations/bigquery.md).
+```
+
+403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: Access Denied: Project bq-walkthrough: User does not have bigquery.jobs.create permission in project bq-walkthrough.
+
+Location: EU
+Job ID: c1476d2c-883c-43f7-a5fe-73db195e7bcd
+```
+
+### Lack of permissions to query/write data
+Add `BigQuery Data Editor` as described in the [destination page](../destinations/bigquery.md).
+```
+
+403 Access Denied: Table bq-walkthrough:weatherapi_data._dlt_loads: User does not have permission to query table bq-walkthrough:weatherapi_data._dlt_loads, or perhaps it does not exist in location EU.
+
+Location: EU
+Job ID: 299a92a3-7761-45dd-a433-79fdeb0c1a46
+```
+
+### Lack of billing / BigQuery in sandbox mode
+`dlt` does not support BigQuery when the project has no billing enabled. If you see a stack trace where the following warning appears:
+```
+
+403 Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. DML queries are not allowed in the free tier. Set up a billing account to remove this restriction.
+```
+or
+
+```
+2023-06-08 16:16:26,769|[WARNING ]|8096|dlt|load.py|complete_jobs:198|Job for weatherapi_resource_83b8ac9e98_4_jsonl retried in load 1686233775.932288 with message {"error_result":{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."},"errors":[{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."}],"job_start":"2023-06-08T14:16:26.850000Z","job_end":"2023-06-08T14:16:26.850000Z","job_id":"weatherapi_resource_83b8ac9e98_4_jsonl"}
+```
+you must enable billing.
\ No newline at end of file
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
new file mode 100644
index 00000000..4aabdfdf
--- /dev/null
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
@@ -0,0 +1,27 @@
+from textwrap import dedent
+
+from ...core.main import Step
+from ...core.sdk import ContinueSDK
+from ...steps.core.core import WaitForUserInputStep
+from ...steps.main import MessageStep
+from .steps import SetupPipelineStep, ValidatePipelineStep, RunQueryStep
+
+# Based on the following guide:
+# https://github.com/dlt-hub/dlt/pull/392
+
+class DDtoBQRecipeRecipe(Step):
+    hide: bool = True
+
+    async def run(self, sdk: ContinueSDK):
+        text_observation = await sdk.run_step(
+            MessageStep(name="Move from using DuckDB to Google BigQuery as the destination", message=dedent("""\
+                This recipe will walk you through the process of moving from using DuckDB to Google BigQuery as the destination for your dlt pipeline. With the help of Continue, you will:
+                - Set up a dlt pipeline for the chess.com API
+                - Switch destination from DuckDB to Google BigQuery
+                - Add BigQuery credentials to your secrets.toml file
+                - Run the pipeline again to load data to BigQuery"""))
+        )
+        await sdk.run_step(
+            SetUpChessPipelineStep() >>
+            SwitchDestinationStep()
+        )
\ No newline at end of file
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
new file mode 100644
index 00000000..4a835e1a
--- /dev/null
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
@@ -0,0 +1,90 @@
+import os
+import subprocess
+from textwrap import dedent
+import time
+
+from ...models.main import Range
+from ...models.filesystem import RangeInFile
+from ...steps.main import MessageStep
+from ...core.sdk import Models
+from ...core.observation import DictObservation, InternalErrorObservation
+from ...models.filesystem_edit import AddFile, FileEdit
+from ...core.main import Step
+from ...core.sdk import ContinueSDK
+
+AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)"
+
+class SetUpChessPipelineStep(Step):
+    hide: bool = True
+    name: str = "Setup Chess.com API dlt Pipeline"
+
+    async def describe(self, models: Models):
+        return "This step will create a new dlt pipeline that loads data from the chess.com API."
+
+    async def run(self, sdk: ContinueSDK):
+
+        # running commands to get started when creating a new dlt pipeline
+        await sdk.run([
+            'python3 -m venv env',
+            'source env/bin/activate',
+            'pip install dlt',
+            'dlt --non-interactive init chess duckdb',
+            'pip install -r requirements.txt',
+        ], name="Set up Python environment", description=dedent(f"""\
+            Running the following commands:
+            - `python3 -m venv env`: Create a Python virtual environment
+            - `source env/bin/activate`: Activate the virtual environment
+            - `pip install dlt`: Install dlt
+            - `dlt init chess duckdb`: Create a new dlt pipeline called "chess" that loads data into a local DuckDB instance
+            - `pip install -r requirements.txt`: Install the Python dependencies for the pipeline"""))
+
+
+class SwitchDestinationStep(Step):
+    hide: bool = True
+
+    async def run(self, sdk: ContinueSDK):
+
+        # Switch destination from DuckDB to Google BigQuery
+        filename = 'chess.py'
+        prompt = 'Replace the "destination" argument with "bigquery"'
+
+        ## edit the pipeline to replace the "destination" argument
+        await sdk.edit_file(
+            filename=filename,
+            prompt=prompt,
+            name=f'Replacing the "destination" argument with "bigquery" {AI_ASSISTED_STRING}'
+        )
+
+        # Add BigQuery credentials to your secrets.toml file
+        template = dedent(f"""\
+            [destination.bigquery.credentials]
+            location = "US" # change the location of the data
+            project_id = "project_id" # please set me up!
+            private_key = "private_key" # please set me up!
+            client_email = "client_email" # please set me up!""")
+
+        ## wait for user to put API key in secrets.toml
+        await sdk.ide.setFileOpen(await sdk.ide.getWorkspaceDirectory() + "/.dlt/secrets.toml")
+        ## append template to bottom of secrets.toml
+        await sdk.wait_for_user_confirmation("Please add your GCP credentials to `secrets.toml` file and then press `Continue`")
+
+        # Run the pipeline again to load data to BigQuery
+        output = await sdk.run('env/bin/python3 chess.py', name="Load data to BigQuery", description="Running `env/bin/python3 chess.py` to load data to Google BigQuery")
+
+        ## TODO: REPLACE WITH APPROACH TO HELPING WITH THINGS MENTIONED IN `## 5. Troubleshoot exceptions`
+        if "Traceback" in output or "SyntaxError" in output:
+            suggestion = sdk.models.gpt35.complete(dedent(f"""\
+                ```python
+                {await sdk.ide.readFile(os.path.join(sdk.ide.workspace_directory, "query.py"))}
+                ```
+                This above code is a query that runs on the DuckDB instance. While attempting to run the query, the following error occurred:
+
+                ```ascii
+                {output}
+                ```
+
+                This is a brief summary of the error followed by a suggestion on how it can be fixed:"""))
+
+            sdk.raise_exception(
+                title="Error while running query", message=output, with_step=MessageStep(name=f"Suggestion to solve error {AI_ASSISTED_STRING}", message=suggestion)
+            )
--
cgit v1.2.3-70-g09d2


From 904a5adccad2ff49370082c32dc90457898fd6d4 Mon Sep 17 00:00:00 2001
From: Nate Sesti
Date: Sun, 11 Jun 2023 22:48:34 -0700
Subject: cleaning bq recipe

---
 continuedev/src/continuedev/core/autopilot.py      |  1 +
 continuedev/src/continuedev/core/policy.py         |  3 ++
 .../DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md    | 59 +++++++---------------
 .../src/continuedev/recipes/DDtoBQRecipe/main.py   |  2 +-
 .../src/continuedev/recipes/DDtoBQRecipe/steps.py  | 35 ++++++++-----
 .../src/continuedev/steps/steps_on_startup.py      |  4 +-
 6 files changed, 48 insertions(+), 56 deletions(-)

(limited to 'continuedev/src')

diff --git a/continuedev/src/continuedev/core/autopilot.py b/continuedev/src/continuedev/core/autopilot.py
index b82e1fef..c979d53a 100644
--- a/continuedev/src/continuedev/core/autopilot.py
+++ b/continuedev/src/continuedev/core/autopilot.py
@@ -35,6 +35,7 @@ class Autopilot(ContinueBaseModel):
 
     class Config:
         arbitrary_types_allowed = True
+        keep_untouched = (cached_property,)
 
     def get_full_state(self) -> FullState:
         return FullState(history=self.history, active=self._active, user_input_queue=self._main_user_input_queue)
diff --git a/continuedev/src/continuedev/core/policy.py b/continuedev/src/continuedev/core/policy.py
index 8aea8de7..8612d834 100644
--- a/continuedev/src/continuedev/core/policy.py
+++ b/continuedev/src/continuedev/core/policy.py
@@ -9,6 +9,7 @@ from ..steps.main import EditHighlightedCodeStep, SolveTracebackStep, RunCodeSte
 from ..recipes.WritePytestsRecipe.main import WritePytestsRecipe
 from ..recipes.ContinueRecipeRecipe.main import ContinueStepStep
 from ..steps.comment_code import CommentCodeStep
+from ..recipes.DDtoBQRecipe.main import DDtoBQRecipeRecipe
 
 
 class DemoPolicy(Policy):
@@ -30,6 +31,8 @@ class DemoPolicy(Policy):
             return WritePytestsRecipe(instructions=observation.user_input)
         elif "/dlt" in observation.user_input.lower() or " dlt" in observation.user_input.lower():
             return CreatePipelineRecipe()
+        elif "/ddtobq" in observation.user_input.lower():
+            return DDtoBQRecipeRecipe()
         elif "/comment" in observation.user_input.lower():
             return CommentCodeStep()
         elif "/ask" in observation.user_input:
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md b/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
index fce9a083..eb68e117 100644
--- a/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/dlt_duckdb_to_bigquery_docs.md
@@ -1,61 +1,28 @@
----
-title: "Share a dataset: duckdb -> BigQuery"
-description: Share a local dataset by moving it to BigQuery
-keywords: [how to, share a dataset]
----
-
-# Share a dataset: duckdb -> BigQuery
-In previous walkthroughs you used the local stack to create and run your pipeline. This saved you the headache of setting up a cloud account and credentials, and often saved you money as well. Our choice for a local "warehouse" is `duckdb`: fast, feature-rich, and working everywhere. However, at some point you will want to move to production or share the results with your colleagues. The local `duckdb` file is not sufficient for that! Let's move the dataset to BigQuery now.
-
-## 1. Replace the "destination" argument with "bigquery"
-```python
-if __name__=='__main__':
-
-    # below we replaced "duckdb" in the "destination" argument with "bigquery"
-    pipeline = dlt.pipeline(pipeline_name='weatherapi', destination='bigquery', dataset_name='weatherapi_data')
-```
-And that's it for the code modifications! If you run the script, `dlt` will create in BigQuery a dataset identical to the one you had in `duckdb`.
-
-## 2. Enable access to BigQuery and obtain credentials.
-Please [follow these steps](../destinations/bigquery.md) to enable `dlt` to write data to BigQuery.
-
-## 3. Add credentials to secrets.toml
-Please add the following section to your `secrets.toml` file, using the credentials obtained in the previous step:
-```toml
-[destination.bigquery.credentials]
-location = "US" # change the location of the data
-project_id = "project_id" # please set me up!
-private_key = "private_key" # please set me up!
-client_email = "client_email" # please set me up!
-```
-
-## 4. Run the pipeline again
-```
-python weatherapi.py
-```
-Head on to the next section if you see exceptions!
-
-## 5. Troubleshoot exceptions
-
 ### Credentials Missing: ConfigFieldMissingException
 
 You'll see this exception if `dlt` cannot find your BigQuery credentials. In the exception below all of them ('project_id', 'private_key', 'client_email') are missing. The exception also gives you the list of all configuration lookups performed - [here we explain how to read such a list](run-a-pipeline.md#missing-secret-or-configuration-values).
+
 ```
 dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['project_id', 'private_key', 'client_email'] in configuration with spec GcpServiceAccountCredentials
     for field "project_id" config providers and keys were tried in following order:
         In Environment Variables key WEATHERAPI__DESTINATION__BIGQUERY__CREDENTIALS__PROJECT_ID was not found.
         In Environment Variables key WEATHERAPI__DESTINATION__CREDENTIALS__PROJECT_ID was not found.
 ```
+
 The most common cases for the exception:
+
 1. The secrets are not in `secrets.toml` at all
 2. They are placed in the wrong section. For example, the fragment below will not work:
+
 ```toml
 [destination.bigquery]
 project_id = "project_id" # please set me up!
 ```
+
 3. You run the pipeline script from a **different** folder than the one it is saved in. For example, `python weatherapi_demo/weatherapi.py` will run the script from the `weatherapi_demo` folder while the current working directory is the folder above. This prevents `dlt` from finding `weatherapi_demo/.dlt/secrets.toml` and filling in credentials.
 
 ### Placeholders still in secrets.toml
+
 Here BigQuery complains that the format of the `private_key` is incorrect. In practice this most often happens when you forget to replace the placeholders in `secrets.toml` with real values.
 
 ```
 
 Connection with BigQuerySqlClient to dataset name weatherapi_data failed. Please check if you configured the credentials at all and provided the right credentials values. You can be also denied access or your internet connection may be down. The actual reason given is: No key could be detected.
Please ``` ### Bigquery not enabled + [You must enable Bigquery API.](https://console.cloud.google.com/apis/dashboard) + ``` 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: BigQuery API has not been used in project 364286133232 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. @@ -72,10 +41,12 @@ Connection with BigQuerySqlClient to dataset name weatherapi_data failed. Please Location: EU Job ID: a5f84253-3c10-428b-b2c8-1a09b22af9b2 [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developers console API activation', 'url': 'https://console.developers.google.com/apis/api/bigquery.googleapis.com/overview?project=364286133232'}]}, {'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'SERVICE_DISABLED', 'domain': 'googleapis.com', 'metadata': {'service': 'bigquery.googleapis.com', 'consumer': 'projects/364286133232'}}] - ``` +``` ### Lack of permissions to create jobs + Add `BigQuery Job User` as described in the [destination page](../destinations/bigquery.md). + ``` 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bq-walkthrough/jobs?prettyPrint=false: Access Denied: Project bq-walkthrough: User does not have bigquery.jobs.create permission in project bq-walkthrough. @@ -85,7 +56,9 @@ Job ID: c1476d2c-883c-43f7-a5fe-73db195e7bcd ``` ### Lack of permissions to query/write data + Add `BigQuery Data Editor` as described in the [destination page](../destinations/bigquery.md). + ``` 403 Access Denied: Table bq-walkthrough:weatherapi_data._dlt_loads: User does not have permission to query table bq-walkthrough:weatherapi_data._dlt_loads, or perhaps it does not exist in location EU. @@ -95,14 +68,18 @@ Job ID: 299a92a3-7761-45dd-a433-79fdeb0c1a46 ``` ### Lack of billing / BigQuery in sandbox mode + `dlt` does not support BigQuery when project has no billing enabled. If you see a stack trace where following warning appears: + ``` 403 Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. DML queries are not allowed in the free tier. Set up a billing account to remove this restriction. ``` + or ``` 2023-06-08 16:16:26,769|[WARNING ]|8096|dlt|load.py|complete_jobs:198|Job for weatherapi_resource_83b8ac9e98_4_jsonl retried in load 1686233775.932288 with message {"error_result":{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."},"errors":[{"reason":"billingNotEnabled","message":"Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Table expiration time must be less than 60 days while in sandbox mode."}],"job_start":"2023-06-08T14:16:26.850000Z","job_end":"2023-06-08T14:16:26.850000Z","job_id":"weatherapi_resource_83b8ac9e98_4_jsonl"} ``` -you must enable the billing. \ No newline at end of file + +you must enable the billing. 
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
index 4aabdfdf..1cb12ff3 100644
--- a/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
@@ -4,7 +4,7 @@ from ...core.main import Step
 from ...core.sdk import ContinueSDK
 from ...steps.core.core import WaitForUserInputStep
 from ...steps.main import MessageStep
-from .steps import SetupPipelineStep, ValidatePipelineStep, RunQueryStep
+from .steps import SetUpChessPipelineStep, SwitchDestinationStep
 
 # Based on the following guide:
 # https://github.com/dlt-hub/dlt/pull/392
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
index 4a835e1a..395cbbc8 100644
--- a/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
@@ -3,6 +3,7 @@ import subprocess
 from textwrap import dedent
 import time
 
+from ...steps.find_and_replace import FindAndReplaceStep
 from ...models.main import Range
 from ...models.filesystem import RangeInFile
 from ...steps.main import MessageStep
@@ -14,6 +15,7 @@ from ...core.sdk import ContinueSDK
 
 AI_ASSISTED_STRING = "(✨ AI-Assisted ✨)"
 
+
 class SetUpChessPipelineStep(Step):
     hide: bool = True
     name: str = "Setup Chess.com API dlt Pipeline"
@@ -45,15 +47,8 @@ class SwitchDestinationStep(Step):
     async def run(self, sdk: ContinueSDK):
 
         # Switch destination from DuckDB to Google BigQuery
-        filename = 'chess.py'
-        prompt = 'Replace the "destination" argument with "bigquery"'
-
-        ## edit the pipeline to replace the "destination" argument
-        await sdk.edit_file(
-            filename=filename,
-            prompt=prompt,
-            name=f'Replacing the "destination" argument with "bigquery" {AI_ASSISTED_STRING}'
-        )
+        filepath = os.path.join(sdk.ide.workspace_directory, 'chess.py')
+        await sdk.run_step(FindAndReplaceStep(filepath=filepath, pattern="destination='duckdb'", replacement="destination='bigquery'"))
 
         # Add BigQuery credentials to your secrets.toml file
         template = dedent(f"""\
@@ -63,16 +58,28 @@ class SwitchDestinationStep(Step):
             private_key = "private_key" # please set me up!
             client_email = "client_email" # please set me up!""")
 
-        ## wait for user to put API key in secrets.toml
-        await sdk.ide.setFileOpen(await sdk.ide.getWorkspaceDirectory() + "/.dlt/secrets.toml")
-        ## append template to bottom of secrets.toml
+        # wait for user to put API key in secrets.toml
+        secrets_path = os.path.join(
+            sdk.ide.workspace_directory, "/.dlt/secrets.toml")
+        await sdk.ide.setFileOpen(secrets_path)
+        await sdk.append_to_file(secrets_path, template)
+
+        # append template to bottom of secrets.toml
         await sdk.wait_for_user_confirmation("Please add your GCP credentials to `secrets.toml` file and then press `Continue`")
 
+
+class LoadDataStep(Step):
+    name: str = "Load data to BigQuery"
+    hide: bool = True
+
+    async def run(self, sdk: ContinueSDK):
         # Run the pipeline again to load data to BigQuery
         output = await sdk.run('env/bin/python3 chess.py', name="Load data to BigQuery", description="Running `env/bin/python3 chess.py` to load data to Google BigQuery")
 
-        ## TODO: REPLACE WITH APPROACH TO HELPING WITH THINGS MENTIONED IN `## 5. Troubleshoot exceptions`
         if "Traceback" in output or "SyntaxError" in output:
+            with open(os.path.join(__file__, "dlt_duckdb_to_bigquery_docs.md"), "r") as f:
+                docs = f.read()
+
             suggestion = sdk.models.gpt35.complete(dedent(f"""\
                 ```python
                 {await sdk.ide.readFile(os.path.join(sdk.ide.workspace_directory, "query.py"))}
                 ```
                 This above code is a query that runs on the DuckDB instance. While attempting to run the query, the following error occurred:
 
                 ```ascii
                 {output}
                 ```
 
+                Here is documentation describing common errors and their causes/solutions:
+
+                {docs}
+
                 This is a brief summary of the error followed by a suggestion on how it can be fixed:"""))
 
             sdk.raise_exception(
                 title="Error while running query", message=output, with_step=MessageStep(name=f"Suggestion to solve error {AI_ASSISTED_STRING}", message=suggestion)
             )
diff --git a/continuedev/src/continuedev/steps/steps_on_startup.py b/continuedev/src/continuedev/steps/steps_on_startup.py
index cd40ff56..ba793425 100644
--- a/continuedev/src/continuedev/steps/steps_on_startup.py
+++ b/continuedev/src/continuedev/steps/steps_on_startup.py
@@ -1,11 +1,13 @@
 from ..core.main import ContinueSDK, Models, Step
 from .main import UserInputStep
 from ..recipes.CreatePipelineRecipe.main import CreatePipelineRecipe
+from ..recipes.DDtoBQRecipe.main import DDtoBQRecipeRecipe
 
 
 step_name_to_step_class = {
     "UserInputStep": UserInputStep,
-    "CreatePipelineRecipe": CreatePipelineRecipe
+    "CreatePipelineRecipe": CreatePipelineRecipe,
+    "DDtoBQRecipeRecipe": DDtoBQRecipeRecipe
 }
--
cgit v1.2.3-70-g09d2


From 3c0d26d888164a1e58b83a22c8221dd8a1344828 Mon Sep 17 00:00:00 2001
From: Nate Sesti
Date: Sun, 11 Jun 2023 23:25:04 -0700
Subject: finished bq recipe

---
 .../src/continuedev/recipes/DDtoBQRecipe/main.py      | 14 +++++++-------
 .../src/continuedev/recipes/DDtoBQRecipe/steps.py     | 17 +++++++++--------
 continuedev/src/continuedev/steps/find_and_replace.py |  3 ++-
 3 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'continuedev/src')

diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
index 1cb12ff3..cd1ff1b9 100644
--- a/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/main.py
@@ -4,24 +4,24 @@ from ...core.main import Step
 from ...core.sdk import ContinueSDK
 from ...steps.core.core import WaitForUserInputStep
 from ...steps.main import MessageStep
-from .steps import SetUpChessPipelineStep, SwitchDestinationStep
+from .steps import SetUpChessPipelineStep, SwitchDestinationStep, LoadDataStep
 
 # Based on the following guide:
 # https://github.com/dlt-hub/dlt/pull/392
 
+
 class DDtoBQRecipeRecipe(Step):
     hide: bool = True
 
     async def run(self, sdk: ContinueSDK):
-        text_observation = await sdk.run_step(
+        await sdk.run_step(
             MessageStep(name="Move from using DuckDB to Google BigQuery as the destination", message=dedent("""\
                 This recipe will walk you through the process of moving from using DuckDB to Google BigQuery as the destination for your dlt pipeline. With the help of Continue, you will:
                 - Set up a dlt pipeline for the chess.com API
                 - Switch destination from DuckDB to Google BigQuery
                 - Add BigQuery credentials to your secrets.toml file
-                - Run the pipeline again to load data to BigQuery"""))
-        )
-        await sdk.run_step(
+                - Run the pipeline again to load data to BigQuery""")) >>
             SetUpChessPipelineStep() >>
-            SwitchDestinationStep()
-        )
\ No newline at end of file
+            SwitchDestinationStep() >>
+            LoadDataStep()
+        )
diff --git a/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
index 395cbbc8..c7e5d095 100644
--- a/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
+++ b/continuedev/src/continuedev/recipes/DDtoBQRecipe/steps.py
@@ -47,7 +47,8 @@ class SwitchDestinationStep(Step):
     async def run(self, sdk: ContinueSDK):
 
         # Switch destination from DuckDB to Google BigQuery
-        filepath = os.path.join(sdk.ide.workspace_directory, 'chess.py')
+        filepath = os.path.join(
+            sdk.ide.workspace_directory, 'chess_pipeline.py')
         await sdk.run_step(FindAndReplaceStep(filepath=filepath, pattern="destination='duckdb'", replacement="destination='bigquery'"))
 
         # Add BigQuery credentials to your secrets.toml file
@@ -60,7 +61,7 @@ class SwitchDestinationStep(Step):
 
         # wait for user to put API key in secrets.toml
         secrets_path = os.path.join(
-            sdk.ide.workspace_directory, "/.dlt/secrets.toml")
+            sdk.ide.workspace_directory, ".dlt/secrets.toml")
         await sdk.ide.setFileOpen(secrets_path)
         await sdk.append_to_file(secrets_path, template)
 
@@ -74,17 +75,15 @@ class LoadDataStep(Step):
 
     async def run(self, sdk: ContinueSDK):
         # Run the pipeline again to load data to BigQuery
-        output = await sdk.run('env/bin/python3 chess.py', name="Load data to BigQuery", description="Running `env/bin/python3 chess.py` to load data to Google BigQuery")
+        output = await sdk.run('env/bin/python3 chess_pipeline.py', name="Load data to BigQuery", description="Running `env/bin/python3 chess_pipeline.py` to load data to Google BigQuery")
 
         if "Traceback" in output or "SyntaxError" in output:
-            with open(os.path.join(__file__, "dlt_duckdb_to_bigquery_docs.md"), "r") as f:
+            with open(os.path.join(os.path.dirname(__file__), "dlt_duckdb_to_bigquery_docs.md"), "r") as f:
                 docs = f.read()
+            output = "Traceback" + output.split("Traceback")[-1]
 
             suggestion = sdk.models.gpt35.complete(dedent(f"""\
-                ```python
-                {await sdk.ide.readFile(os.path.join(sdk.ide.workspace_directory, "query.py"))}
-                ```
-                This above code is a query that runs on the DuckDB instance. While attempting to run the query, the following error occurred:
+                When trying to load data into BigQuery, the following error occurred:
 
                 ```ascii
                 {output}
                 ```
 
                 Here is documentation describing common errors and their causes/solutions:
 
                 {docs}
 
                 This is a brief summary of the error followed by a suggestion on how it can be fixed:"""))
 
             sdk.raise_exception(
                 title="Error while running query", message=output, with_step=MessageStep(name=f"Suggestion to solve error {AI_ASSISTED_STRING}", message=suggestion)
             )
diff --git a/continuedev/src/continuedev/steps/find_and_replace.py b/continuedev/src/continuedev/steps/find_and_replace.py
index 78511b27..c9654867 100644
--- a/continuedev/src/continuedev/steps/find_and_replace.py
+++ b/continuedev/src/continuedev/steps/find_and_replace.py
@@ -19,7 +19,8 @@ class FindAndReplaceStep(Step):
         end_index = start_index + len(self.pattern)
         await sdk.ide.applyFileSystemEdit(FileEdit(
             filepath=self.filepath,
-            range=Range.from_indices(file_content, start_index, end_index),
+            range=Range.from_indices(
+                file_content, start_index, end_index - 1),
             replacement=self.replacement
         ))
         file_content = file_content[:start_index] + \
--
cgit v1.2.3-70-g09d2
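The final hunk above narrows the edit range passed to `Range.from_indices` from `end_index` to `end_index - 1`, which suggests the range's end is an inclusive character index: the last character of a match sits at `start_index + len(pattern) - 1`, so the old code covered one character too many. Below is a standalone sketch of the same replacement loop under that assumption; plain tuples stand in for the `Range` and `FileEdit` types, so this is a simplified model rather than the continuedev API:

```python
def find_and_replace(file_content: str, pattern: str, replacement: str):
    """Simplified model of FindAndReplaceStep: replace every occurrence of
    `pattern`, recording each edit's inclusive character range."""
    edits = []
    while pattern in file_content:
        start_index = file_content.index(pattern)
        end_index = start_index + len(pattern)
        # With an inclusive end, the last matched character is at
        # start_index + len(pattern) - 1; recording `end_index` unchanged
        # (the pre-fix behavior) would extend the edit one character past
        # the match.
        edits.append((start_index, end_index - 1))
        file_content = file_content[:start_index] + \
            replacement + file_content[end_index:]
    return file_content, edits


if __name__ == "__main__":
    new_text, edits = find_and_replace(
        "pipeline = dlt.pipeline(destination='duckdb')",
        "destination='duckdb'",
        "destination='bigquery'",
    )
    print(new_text)  # pipeline = dlt.pipeline(destination='bigquery')
    print(edits)     # [(24, 43)]
```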