summary | refs | log | tree | commit | diff
path: root/continuedev/src
diff options
context:
space:
mode:
author Nate Sesti <sestinj@gmail.com> 2023-07-21 19:51:23 -0700
committer Nate Sesti <sestinj@gmail.com> 2023-07-21 19:51:23 -0700
commit a87e66758731a9e76c9c394dc2190b9882ddbceb (patch)
tree d78cea0923aa354eba8e040f9e5e69c0d179e7c3 /continuedev/src
parent 49bde0769d02b626df507c58366f763f0fdfc05e (diff)
download:
sncontinue-a87e66758731a9e76c9c394dc2190b9882ddbceb.tar.gz
sncontinue-a87e66758731a9e76c9c394dc2190b9882ddbceb.tar.bz2
sncontinue-a87e66758731a9e76c9c394dc2190b9882ddbceb.zip
clean pii from telemetry
Diffstat (limited to 'continuedev/src')
-rw-r--r-- continuedev/src/continuedev/libs/util/commonregex.py | 138
-rw-r--r-- continuedev/src/continuedev/libs/util/telemetry.py | 7
2 files changed, 144 insertions, 1 deletions
diff --git a/continuedev/src/continuedev/libs/util/commonregex.py b/continuedev/src/continuedev/libs/util/commonregex.py
new file mode 100644
index 00000000..55da7fc0
--- /dev/null
+++ b/continuedev/src/continuedev/libs/util/commonregex.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+import json
+import re
+from typing import Any, Dict
+
# Month names and their abbreviations. The full name is listed before its
# abbreviation so that "July" is consumed whole; with the abbreviation first
# the day-first form "21 July 2023" matched only "21 Jul".
_MONTH = (
    r'(?:january|jan\.?|february|feb\.?|march|mar\.?|april|apr\.?|may'
    r'|june|jun\.?|july|jul\.?|august|aug\.?|september|sep\.?'
    r'|october|oct\.?|november|nov\.?|december|dec\.?)'
)
# Day of month with an optional ordinal suffix. The lookbehinds reject digits
# that directly follow ":" or ":<digit>" (presumably the minutes of a clock
# time such as "10:30").
_DAY = r'(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?'

# Dates: "<day> [of] <month> [year]", "<month> <day>[,] [year]", or a numeric
# d/m/y style date separated by "-", "." or "/".
date = re.compile(
    r'(?:' + _DAY + r'\s+(?:of\s+)?' + _MONTH
    + r'|' + _MONTH + r'\s+' + _DAY + r')(?:\,)?\s*(?:\d{4})?'
    r'|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}',
    re.IGNORECASE)

# Clock times: "h:mm" with an optional am/pm marker, or a single digit
# followed by am/pm (e.g. "9am").
time = re.compile(
    r'\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?', re.IGNORECASE)

# Phone numbers. The first alternative is an optional country code, an
# optional 3-digit area code and a 3+4 digit number; the second is a 2-digit
# country code followed by 2+3+4 digits.
phone = re.compile(
    r'((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?'
    r'\d{3}[-.\s*]?\d{4}(?![\d-]))'
    r'|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))'
    r'\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))')

# Phone numbers that are followed by an extension marker
# ("#", "x", "ext" or "extension") and optional extension digits.
phones_with_exts = re.compile(
    r'((?:(?:\+?1\s*(?:[.-]\s*)?)?'
    r'(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)'
    r'|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?'
    r'(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})'
    r'(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))',
    re.IGNORECASE)
# Top-level domains recognised by ``link``. Order is significant: regex
# alternation is tried left to right, so longer names precede the shorter
# names they start with (e.g. "com" before "co").
_TLDS = (
    'international', 'construction', 'contractors', 'enterprises',
    'photography', 'immobilien', 'management', 'technology', 'directory',
    'education', 'equipment', 'institute', 'marketing', 'solutions',
    'builders', 'clothing', 'computer', 'democrat', 'diamonds', 'graphics',
    'holdings', 'lighting', 'plumbing', 'training', 'ventures', 'academy',
    'careers', 'company', 'domains', 'florist', 'gallery', 'guitars',
    'holiday', 'kitchen', 'recipes', 'shiksha', 'singles', 'support',
    'systems', 'agency', 'berlin', 'camera', 'center', 'coffee', 'estate',
    'kaufen', 'luxury', 'monash', 'museum', 'photos', 'repair', 'social',
    'tattoo', 'travel', 'viajes', 'voyage', 'build', 'cheap', 'codes',
    'dance', 'email', 'glass', 'house', 'ninja', 'photo', 'shoes', 'solar',
    'today', 'aero', 'arpa', 'asia', 'bike', 'buzz', 'camp', 'club', 'coop',
    'farm', 'gift', 'guru', 'info', 'jobs', 'kiwi', 'land', 'limo', 'link',
    'menu', 'mobi', 'moda', 'name', 'pics', 'pink', 'post', 'rich', 'ruhr',
    'sexy', 'tips', 'wang', 'wien', 'zone', 'biz', 'cab', 'cat', 'ceo',
    'com', 'edu', 'gov', 'int', 'mil', 'net', 'onl', 'org', 'pro', 'red',
    'tel', 'uno', 'xxx',
    'ac', 'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar',
    'as', 'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg',
    'bh', 'bi', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by',
    'bz', 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn',
    'co', 'cr', 'cu', 'cv', 'cw', 'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm',
    'do', 'dz', 'ec', 'ee', 'eg', 'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk',
    'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl',
    'gm', 'gn', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm',
    'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', 'im', 'in', 'io', 'iq', 'ir',
    'is', 'it', 'je', 'jm', 'jo', 'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn',
    'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls',
    'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mk', 'ml',
    'mm', 'mn', 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx',
    'my', 'mz', 'na', 'nc', 'ne', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr',
    'nu', 'nz', 'om', 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn',
    'pr', 'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa',
    'sb', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn',
    'so', 'sr', 'st', 'su', 'sv', 'sx', 'sy', 'sz', 'tc', 'td', 'tf', 'tg',
    'th', 'tj', 'tk', 'tl', 'tm', 'tn', 'to', 'tp', 'tr', 'tt', 'tv', 'tw',
    'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi',
    'vn', 'vu', 'wf', 'ws', 'ye', 'yt', 'za', 'zm', 'zw',
)

# Links: an optional "http(s)://" or "www." prefix, a host name ending in one
# of the known TLDs, and an optional path whose final character is not
# whitespace, a bracket, or one of the listed punctuation/quote characters.
link = re.compile(
    r'(?i)((?:https?://|www\d{0,3}[.])?[a-z0-9.\-]+[.](?:'
    + '|'.join(_TLDS)
    + r''')(?:/[^\s()<>]+[^\s`!()\[\]{};:'".,<>?\xab\xbb\u201c\u201d\u2018\u2019])?)''',
    re.IGNORECASE)
# All patterns below are raw strings so that regex escapes such as "\d" and
# "\." reach the regex engine untouched instead of relying on Python leaving
# unrecognised string escapes alone (which emits a warning on newer versions).

# E-mail addresses: local part, "@", then one or more dot-separated labels.
email = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
    r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE)

# One IPv4 octet (0-255).
_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
# Dotted-quad IPv4 addresses.
ip = re.compile(r'\.'.join([_OCTET] * 4), re.IGNORECASE)

# IPv6 addresses, including the "::" shorthand and a trailing embedded IPv4
# part. The pattern also consumes whitespace on either side of the address.
ipv6 = re.compile(
    r'\s*(?!.*::.*::)(?:(?!:)|:(?=:))'
    r'(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}'
    r'(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}'
    r'(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)'
    r'|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)'
    r'(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*',
    re.VERBOSE | re.IGNORECASE | re.DOTALL)

# Dollar amounts with optional sign, thousands separators and cents.
price = re.compile(r'[$]\s?[+-]?[0-9]{1,3}(?:(?:,?[0-9]{3}))*(?:\.[0-9]{1,2})?')

# "#" followed by 8, 3 or 6 hex digits, ending at a word boundary.
hex_color = re.compile(r'(#(?:[0-9a-fA-F]{8})|#(?:[0-9a-fA-F]{3}){1,2})\b')

# Four groups of four digits (optionally separated by "-" or " "), or a run
# of 15-16 digits, not followed by another digit.
credit_card = re.compile(r'((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])')

# Strings starting with "1" or "3" followed by 26-33 characters drawn from
# the character set used below, not embedded in a longer such run.
btc_address = re.compile(
    r'(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}'
    r'(?![a-km-zA-HJ-NP-Z0-9])')

# A 1-4 digit number, up to 20 word/space characters, then a street-type
# word (or its abbreviation) followed by whitespace or end of input.
street_address = re.compile(
    r'\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square'
    r'|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard'
    r'|blvd)\W?(?=\s|$)',
    re.IGNORECASE)

# Five digits with an optional four-digit suffix.
zip_code = re.compile(r'\b\d{5}(?:[-\s]\d{4})?\b')

# "PO Box <number>" in its common punctuation variants.
po_box = re.compile(r'P\.? ?O\.? Box \d+', re.IGNORECASE)

# Three digits, two digits and four digits separated by "-" or " ".
ssn = re.compile(
    r'(?!000|666|333)0*(?:[0-6][0-9][0-9]|[0-7][0-6][0-9]|[0-7][0-7][0-2])'
    r'[- ](?!00)[0-9]{2}[- ](?!0000)[0-9]{4}')

# Absolute Windows path (drive letter or UNC share). Anchored with "^", so it
# only matches at the very start of the searched text.
win_absolute_filepath = re.compile(
    r'^(?:[a-zA-Z]\:|\\\\[\w\.]+\\[\w.$]+)\\(?:[\w]+\\)*\w([\w.])+', re.IGNORECASE)

# Absolute POSIX path. Anchored with "^", so it only matches at the very
# start of the searched text.
unix_absolute_filepath = re.compile(
    r'^\/(?:[\/\w]+\/)*\w([\w.])+', re.IGNORECASE)
+
# Name -> compiled pattern.
#
# Insertion order matters: clean_pii_from_str() applies the substitutions in
# this dict's iteration order, and an earlier pattern can consume part of what
# a later, more specific pattern needs. The more specific pattern of each
# overlapping pair is therefore listed first:
#   * emails before links  - otherwise "john@example.com" becomes
#     "john@<LINK>" and the local part of the address is left in the text;
#   * phones_with_exts before phones - otherwise the number is replaced
#     first and the extension is left behind;
#   * ipv6s / ips, credit_cards and ssn_number before phones, dates and
#     times - otherwise digit runs inside them can be replaced piecemeal.
regexes = {
    "win_absolute_filepath": win_absolute_filepath,
    "unix_absolute_filepath": unix_absolute_filepath,
    "emails": email,
    "links": link,
    "ipv6s": ipv6,
    "ips": ip,
    "credit_cards": credit_card,
    "ssn_number": ssn,
    "phones_with_exts": phones_with_exts,
    "phones": phone,
    "dates": date,
    "times": time,
    "prices": price,
    "hex_colors": hex_color,
    "btc_addresses": btc_address,
    "street_addresses": street_address,
    "zip_codes": zip_code,
    "po_boxes": po_box,
}
+
# Replacement token for each pattern name; the keys mirror those of the
# ``regexes`` mapping.
placeholders = dict(
    win_absolute_filepath="<FILEPATH>",
    unix_absolute_filepath="<FILEPATH>",
    dates="<DATE>",
    times="<TIME>",
    phones="<PHONE>",
    phones_with_exts="<PHONE_WITH_EXT>",
    links="<LINK>",
    emails="<EMAIL>",
    ips="<IP>",
    ipv6s="<IPV6>",
    prices="<PRICE>",
    hex_colors="<HEX_COLOR>",
    credit_cards="<CREDIT_CARD>",
    btc_addresses="<BTC_ADDRESS>",
    street_addresses="<STREET_ADDRESS>",
    zip_codes="<ZIP_CODE>",
    po_boxes="<PO_BOX>",
    ssn_number="<SSN>",
)
+
+
class regex:
    """Bind a compiled pattern to an object that carries a ``text`` attribute.

    Calling an instance returns a finder function. The finder searches the
    text it is given, or ``obj.text`` when called without (or with falsy)
    text, and returns every match as a whitespace-stripped string.
    """

    def __init__(self, obj, regex):
        self.obj = obj      # object whose ``.text`` is the default search target
        self.regex = regex  # compiled pattern used by the finder

    def __call__(self, *args):
        """Return the finder function; positional arguments are ignored."""
        def regex_method(text=None):
            haystack = text or self.obj.text
            # Use the whole match (group 0) rather than findall(): findall()
            # returns only the capture group when a pattern has exactly one,
            # which for patterns with an inner group such as ``([\w.])+``
            # yields just the last captured character.
            return [m.group(0).strip() for m in self.regex.finditer(haystack)]
        return regex_method
+
+
class CommonRegex(object):
    """Expose every entry of ``regexes`` as an attribute named after its key.

    Without text, each attribute is a finder function taking the text to
    search. With text, each attribute is instead the list of matches that the
    corresponding pattern finds in that text.
    """

    def __init__(self, text=""):
        self.text = text

        # Install one finder per pattern.
        for name, pattern in regexes.items():
            setattr(self, name, regex(self, pattern)(self))

        if not text:
            return

        # Text was supplied: replace each finder with its result list.
        for name in regexes:
            finder = getattr(self, name)
            setattr(self, name, finder())


# Module-level parser instance created without text, so its attributes are
# finder functions.
pii_parser = CommonRegex()
+
+
def clean_pii_from_str(text: str):
    """Return ``text`` with every pattern match swapped for its placeholder.

    The patterns in ``regexes`` are applied one after another, in the
    mapping's iteration order, each on the output of the previous one.
    """
    scrubbed = text
    for kind in regexes:
        scrubbed = regexes[kind].sub(placeholders[kind], scrubbed)
    return scrubbed
+
+
def clean_pii_from_any(v: Any) -> Any:
    """Replace personally identifiable information (PII) with placeholders.

    Not guaranteed to return the same type as the input:

    * ``str`` values are cleaned directly.
    * ``dict`` values are rebuilt with cleaned values; keys are kept as-is.
    * ``list`` and ``tuple`` values are rebuilt element by element (a tuple
      subclass comes back as a plain ``tuple``).
    * Anything else is converted with ``str()``; if cleaning changes that
      text the cleaned string is returned, otherwise the original object.
    """
    if isinstance(v, str):
        return clean_pii_from_str(v)
    if isinstance(v, dict):
        return {key: clean_pii_from_any(value) for key, value in v.items()}
    if isinstance(v, list):
        return [clean_pii_from_any(item) for item in v]
    if isinstance(v, tuple):
        # Clean each element so the structure survives, instead of flattening
        # the whole tuple into one string.
        return tuple(clean_pii_from_any(item) for item in v)

    # Fall back to the object's string form. Best effort: an object whose
    # __str__ fails is passed through untouched. ``Exception`` (rather than a
    # bare ``except``) lets KeyboardInterrupt and SystemExit propagate.
    try:
        orig_text = str(v)
        cleaned_text = clean_pii_from_str(orig_text)
    except Exception:
        return v
    return cleaned_text if cleaned_text != orig_text else v
diff --git a/continuedev/src/continuedev/libs/util/telemetry.py b/continuedev/src/continuedev/libs/util/telemetry.py
index bd9fde9d..17735dce 100644
--- a/continuedev/src/continuedev/libs/util/telemetry.py
+++ b/continuedev/src/continuedev/libs/util/telemetry.py
@@ -3,6 +3,7 @@ from posthog import Posthog
from ...core.config import load_config
import os
from dotenv import load_dotenv
+from .commonregex import clean_pii_from_any
load_dotenv()
in_codespaces = os.getenv("CODESPACES") == "true"
@@ -13,10 +14,14 @@ posthog = Posthog('phc_JS6XFROuNbhJtVCEdTSYk6gl5ArRrTNMpCcguAXlSPs',
def capture_event(unique_id: str, event_name: str, event_properties: Any):
    """Send a telemetry event to PostHog after scrubbing PII from its properties.

    Does nothing when the loaded config has ``allow_anonymous_telemetry``
    turned off. When running in Codespaces a ``codespaces: True`` property is
    added to the event.
    """
    # Return early if telemetry is disabled
    config = load_config('.continue/config.json')
    if not config.allow_anonymous_telemetry:
        return

    if in_codespaces:
        # Build a new mapping instead of writing into the caller's object.
        event_properties = {**event_properties, 'codespaces': True}

    # Send event to PostHog
    posthog.capture(unique_id, event_name,
                    clean_pii_from_any(event_properties))