1 files changed, 98 insertions, 0 deletions
diff --git a/test-chill/testchill/_extract.py b/test-chill/testchill/_extract.py
new file mode 100644
index 0000000..f6984ac
--- /dev/null
+++ b/test-chill/testchill/_extract.py
@@ -0,0 +1,98 @@
+import collections
+import os
+import os.path
+import itertools
+import re
+
+from . import util
+
+if util.python_version_major == 2:
+    from HTMLParser import HTMLParser
+else:
+    from html.parser import HTMLParser
+
+class _TagExtractor(HTMLParser):
+    """Pull ``<tagname>`` elements (text + attributes) out of source-file comments."""
+    # extension -> [(start_regex, end_regex), ...]; a 1-tuple reuses its regex as the end.
+    _comment_style_expr = {
+            'c':      [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'cc':     [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'cpp':    [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'h':      [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'hh':     [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'hpp':    [('/(/)+',r'[\n]'),(r'/\*',r'\*/')],
+            'py':     [('#+',r'[\n]'),('\'\'\'',),('"""',)],
+            'script': [('#+',r'[\n]')],
+            'lua':    [(r'--\[\[',r'\]\]--')]
+        }
+
+    def __init__(self, tagname):
+        HTMLParser.__init__(self)
+        self.tagname = tagname
+        self._readin = False
+        self._value = ''
+        self._attrs = {}      # attributes of the tag currently being read
+        self._tag_list = []   # (text, attrs) results; set here so feed() works without _parse()
+
+    def handle_starttag(self, tag, attrs):
+        """Begin capturing text when the wanted tag opens and remember its attributes."""
+        if tag == self.tagname:
+            self._readin = True
+            self._attrs = dict(attrs)
+
+    def handle_endtag(self, tag):
+        """Store the captured (text, attrs) pair; an end tag with no open start tag is ignored."""
+        if tag == self.tagname and self._readin:
+            self._readin = False
+            self._tag_list.append((self._value, self._attrs))
+            self._value = ''
+
+    def handle_data(self, txt):
+        """Accumulate text that appears between the wanted start and end tags."""
+        if self._readin:
+            self._value += txt
+
+    @classmethod
+    def _parse(cls, tagname, txt):
+        """Return a list of (text, attrs_dict) pairs, one per ``tagname`` element in txt."""
+        reader = cls(tagname)
+        reader.feed(txt)
+        return reader._tag_list
+
+    @classmethod
+    def _get_commentstyles(cls, ext):
+        """Yield (start_regex, end_regex) per comment style of ext; KeyError if ext is unknown."""
+        for comment_style in cls._comment_style_expr[ext]:
+            yield comment_style[0], comment_style[-1]  # 1-tuple: same regex opens and closes
+
+    @classmethod
+    def _commented(cls, txt, ext):
+        """Yield each comment body in txt by start offset; unclosed comments run to the end."""
+        comment_spans = []
+        for start_expr, end_expr in cls._get_commentstyles(ext):
+            # Search from an offset; slicing txt per match copied it (quadratic).
+            start_re, end_re = re.compile(start_expr), re.compile(end_expr)
+            start_match = start_re.search(txt)
+            while start_match:
+                start_pos = start_match.end()
+                end_match = end_re.search(txt, start_pos)
+                end_pos = end_match.start() if end_match else len(txt)
+                comment_spans.append((start_pos, end_pos))
+                if not end_match:
+                    break
+                start_match = start_re.search(txt, end_match.end())
+        for start_pos, end_pos in sorted(comment_spans, key=lambda s: s[0]):
+            yield txt[start_pos:end_pos]
+
+    @classmethod
+    def extract_tag(cls, tagname, filename, wd=None):
+        """Return [(text, attrs), ...] for every ``tagname`` element in filename's comments."""
+        if wd is None:  # resolve at call time; an os.getcwd() default froze at import
+            wd = os.getcwd()
+        with open(os.path.join(wd, filename), 'r') as f:
+            txt = f.read()
+        ext = filename.split('.')[-1]
+        return cls._parse(tagname, '\n'.join(cls._commented(txt, ext)))
+
+extract_tag = _TagExtractor.extract_tag  # module-level alias for the classmethod
+