diff options
Diffstat (limited to 'test-chill/testchill/_extract.py')
-rw-r--r-- | test-chill/testchill/_extract.py | 98 |
1 files changed, 98 insertions, 0 deletions
"""Extract HTML-style tags that are embedded in the comments of source files."""

import collections
import os
import os.path
import itertools
import re

from . import util

if util.python_version_major == 2:
    from HTMLParser import HTMLParser
else:
    from html.parser import HTMLParser


class _TagExtractor(HTMLParser):
    """Collect the text and attributes of every ``<tagname ...>`` element.

    The parser is fed the concatenated comment text of a source file and
    records one ``(text, attrs)`` pair for each closed ``tagname`` element,
    where ``attrs`` is a dict built from the start tag's attributes.

    Tag names are compared exactly as ``HTMLParser`` reports them to the
    handlers (the standard library documents these as lower-cased).
    """

    # Maps a file extension to its comment delimiters.  Each entry is a tuple
    # of regular expressions: ``(start, end)``, or ``(delim,)`` when the same
    # delimiter both opens and closes the comment.
    _comment_style_expr = {
        'c':      [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'cc':     [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'cpp':    [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'h':      [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'hh':     [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'hpp':    [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'py':     [('#+', r'[\n]'), ('\'\'\'',), ('"""',)],
        'script': [('#+', r'[\n]')],
        'lua':    [(r'--\[\[', r'\]\]--')]
    }

    def __init__(self, tagname):
        """Create a parser that collects elements named ``tagname``."""
        # Explicit base call (not super()): HTMLParser is an old-style class
        # on Python 2.
        HTMLParser.__init__(self)
        self.tagname = tagname
        self._readin = False
        self._value = ''
        # Initialised here so a directly constructed instance, or an end tag
        # seen without a matching start tag, cannot raise AttributeError.
        self._attrs = {}
        self._tag_list = []

    def handle_starttag(self, tag, attrs):
        """Start capturing data when the wanted tag opens."""
        if tag == self.tagname:
            self._readin = True
            self._attrs = dict(attrs)

    def handle_endtag(self, tag):
        """Stop capturing and record ``(text, attrs)`` for the closed tag."""
        if tag == self.tagname:
            self._readin = False
            self._tag_list.append((self._value, self._attrs))
            self._value = ''

    def handle_data(self, txt):
        """Accumulate character data while inside the wanted tag."""
        if self._readin:
            self._value += txt

    @classmethod
    def _parse(cls, tagname, txt):
        """Return the list of ``(text, attrs)`` pairs for ``tagname`` in ``txt``."""
        reader = cls(tagname)
        reader.feed(txt)
        return reader._tag_list

    @classmethod
    def _get_commentstyles(cls, ext):
        """Yield ``(start_expr, end_expr)`` regex pairs for extension ``ext``.

        Raises ``KeyError`` (on iteration) if ``ext`` has no registered
        comment style, and ``ValueError`` if a registered style is not a
        tuple of one or two expressions.
        """
        for comment_style in cls._comment_style_expr[ext]:
            if len(comment_style) == 1:
                # A single delimiter both opens and closes the comment.
                start_expr = end_expr = comment_style[0]
            elif len(comment_style) == 2:
                start_expr, end_expr = comment_style
            else:
                # Previously this fell through with unbound (or stale) names.
                raise ValueError(
                    'comment style must have 1 or 2 expressions: %r'
                    % (comment_style,))
            yield start_expr, end_expr

    @classmethod
    def _commented(cls, txt, ext):
        """Yield the body of every comment found in ``txt``, in source order.

        Each yielded string is the text strictly between a start delimiter
        and the next end delimiter; the delimiters themselves are excluded.
        A comment with no end delimiter runs to the end of ``txt``.  Every
        comment style registered for ``ext`` is scanned independently over
        the whole text, so spans from different styles may overlap.
        """
        comment_spans = []
        for start_expr, end_expr in cls._get_commentstyles(ext):
            start_re = re.compile(start_expr)
            end_re = re.compile(end_expr)
            pos = 0
            while pos < len(txt):
                # Search from an offset instead of slicing txt[pos:], which
                # copied the remaining text on every iteration.
                start_match = start_re.search(txt, pos)
                if start_match is None:
                    break
                start_pos = start_match.end()
                end_match = end_re.search(txt, start_pos)
                if end_match is None:
                    # Unterminated comment: take everything that is left.
                    end_pos = pos = len(txt)
                else:
                    end_pos = end_match.start()
                    pos = end_match.end()
                comment_spans.append((start_pos, end_pos))
        for start_pos, end_pos in sorted(comment_spans, key=lambda s: s[0]):
            yield txt[start_pos:end_pos]

    @classmethod
    def extract_tag(cls, tagname, filename, wd=None):
        """Return ``(text, attrs)`` pairs for ``tagname`` in a file's comments.

        ``filename`` is joined onto ``wd``; when ``wd`` is ``None`` the
        current working directory at call time is used.  The comment syntax
        is selected from the text after the last ``.`` in ``filename``; an
        unregistered extension raises ``KeyError``.
        """
        if wd is None:
            # Resolved per call: a default of os.getcwd() in the signature
            # would be frozen at import time.
            wd = os.getcwd()
        with open(os.path.join(wd, filename), 'r') as f:
            txt = f.read()
        ext = filename.split('.')[-1]
        return cls._parse(tagname, '\n'.join(cls._commented(txt, ext)))


extract_tag = _TagExtractor.extract_tag