1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
import collections
import os
import os.path
import itertools
import re
from . import util
if util.python_version_major == 2:
from HTMLParser import HTMLParser
else:
from html.parser import HTMLParser
class _TagExtractor(HTMLParser):
    """Collect the contents of one markup tag found inside source comments.

    An instance is an ``HTMLParser`` that records, for every
    ``<tagname ...> ... </tagname>`` element it is fed, a tuple
    ``(text, attrs)`` where ``text`` is the character data seen while the
    element was open and ``attrs`` is a dict of the start tag's attributes.

    The class methods locate comments in a source text (using the regular
    expressions registered per file extension in ``_comment_style_expr``)
    and feed only the comment bodies to the parser.
    """

    # Maps a file extension to its comment styles.  Each style is a tuple of
    # regular expressions: ``(start, end)``, or ``(delimiter,)`` when the
    # same expression opens and closes the comment.
    _comment_style_expr = {
        'c': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'cc': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'cpp': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'h': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'hh': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'hpp': [('/(/)+', r'[\n]'), (r'/\*', r'\*/')],
        'py': [('#+', r'[\n]'), ('\'\'\'',), ('"""',)],
        'script': [('#+', r'[\n]')],
        'lua': [(r'--\[\[', r'\]\]--')]
    }

    def __init__(self, tagname):
        """Create a parser that records elements named ``tagname``."""
        # Explicit base call (not super()) keeps this working with the
        # Python 2 HTMLParser as well.
        HTMLParser.__init__(self)
        self.tagname = tagname
        self._readin = False    # True while inside an open <tagname> element
        self._value = ''        # character data collected for the open element
        self._attrs = {}        # attributes of the most recent start tag
        self._tag_list = []     # finished (text, attrs) tuples, in order

    def _is_target(self, tag):
        """Return True if ``tag`` (as reported by HTMLParser) is our tag."""
        # HTMLParser hands tag names over lower-cased, so the configured
        # name has to be lower-cased too or mixed-case names never match.
        return tag == self.tagname.lower()

    def handle_starttag(self, tag, attrs):
        """Start collecting data when the wanted tag opens."""
        if self._is_target(tag):
            self._readin = True
            self._attrs = dict(attrs)

    def handle_endtag(self, tag):
        """Store the collected ``(text, attrs)`` when the wanted tag closes."""
        # A closing tag without a matching open element carries no content;
        # ignoring it avoids a spurious ('', stale_attrs) entry.
        if self._is_target(tag) and self._readin:
            self._readin = False
            self._tag_list.append((self._value, self._attrs))
            self._value = ''

    def handle_data(self, txt):
        """Accumulate character data while the wanted element is open."""
        if self._readin:
            self._value += txt

    @classmethod
    def _parse(cls, tagname, txt):
        """Return the list of ``(text, attrs)`` for ``tagname`` in ``txt``."""
        reader = cls(tagname)
        reader.feed(txt)
        return reader._tag_list

    @classmethod
    def _get_commentstyles(cls, ext):
        """Yield ``(start_expr, end_expr)`` for each comment style of ``ext``.

        Raises:
            KeyError: if no comment style is registered for ``ext`` (raised
                when the generator is first advanced).
        """
        if ext not in cls._comment_style_expr:
            raise KeyError(
                'no comment style known for file extension %r' % (ext,))
        for comment_style in cls._comment_style_expr[ext]:
            # A one-element style uses the same expression to open and to
            # close the comment; otherwise the first opens, the last closes.
            yield comment_style[0], comment_style[-1]

    @classmethod
    def _commented(cls, txt, ext):
        """Yield the body of every comment in ``txt``, ordered by position.

        The delimiters themselves are not part of the yielded text.  A
        comment that is never closed extends to the end of ``txt``.
        """
        comment_spans = []
        text_len = len(txt)
        for start_expr, end_expr in cls._get_commentstyles(ext):
            start_re = re.compile(start_expr)
            end_re = re.compile(end_expr)
            pos = 0
            while pos < text_len:
                # Searching from an offset avoids copying the remainder of
                # the text on every iteration.  The registered patterns use
                # no anchors or look-behind, so this matches the same spans
                # as searching a slice would.
                start_match = start_re.search(txt, pos)
                if start_match is None:
                    break
                start_pos = start_match.end()
                end_match = end_re.search(txt, start_pos)
                if end_match is None:
                    end_pos = text_len
                    pos = text_len
                else:
                    end_pos = end_match.start()
                    pos = end_match.end()
                comment_spans.append((start_pos, end_pos))
        for start_pos, end_pos in sorted(comment_spans,
                                         key=lambda span: span[0]):
            yield txt[start_pos:end_pos]

    @classmethod
    def extract_tag(cls, tagname, filename, wd=None):
        """Return ``(text, attrs)`` for each ``tagname`` in a file's comments.

        Args:
            tagname: name of the markup tag to look for.
            filename: path of the source file, joined onto ``wd``.
            wd: directory ``filename`` is relative to; defaults to the
                working directory at the time of the call.

        Raises:
            KeyError: if the file extension has no registered comment style.
        """
        # Resolve the default here: a default of os.getcwd() in the
        # signature would be frozen at import time.
        if wd is None:
            wd = os.getcwd()
        with open(os.path.join(wd, filename), 'r') as f:
            txt = f.read()
        # Use the base name only, so a dot in a directory name cannot be
        # mistaken for the start of the extension.
        ext = os.path.basename(filename).split('.')[-1]
        return cls._parse(tagname, '\n'.join(cls._commented(txt, ext)))


# Module-level shortcut for _TagExtractor.extract_tag.
extract_tag = _TagExtractor.extract_tag
|