diff options
Diffstat (limited to 'lib/python2.7/htmllib.py')
-rw-r--r-- | lib/python2.7/htmllib.py | 491 |
1 files changed, 491 insertions, 0 deletions
diff --git a/lib/python2.7/htmllib.py b/lib/python2.7/htmllib.py new file mode 100644 index 0000000..44647db --- /dev/null +++ b/lib/python2.7/htmllib.py @@ -0,0 +1,491 @@ +"""HTML 2.0 parser. + +See the HTML 2.0 specification: +http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html +""" + +from warnings import warnpy3k +warnpy3k("the htmllib module has been removed in Python 3.0", + stacklevel=2) +del warnpy3k + +import sgmllib + +from formatter import AS_IS + +__all__ = ["HTMLParser", "HTMLParseError"] + + +class HTMLParseError(sgmllib.SGMLParseError): + """Error raised when an HTML document can't be parsed.""" + + +class HTMLParser(sgmllib.SGMLParser): + """This is the basic HTML parser class. + + It supports all entity names required by the XHTML 1.0 Recommendation. + It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2 + elements. + + """ + + from htmlentitydefs import entitydefs + + def __init__(self, formatter, verbose=0): + """Creates an instance of the HTMLParser class. + + The formatter parameter is the formatter instance associated with + the parser. + + """ + sgmllib.SGMLParser.__init__(self, verbose) + self.formatter = formatter + + def error(self, message): + raise HTMLParseError(message) + + def reset(self): + sgmllib.SGMLParser.reset(self) + self.savedata = None + self.isindex = 0 + self.title = None + self.base = None + self.anchor = None + self.anchorlist = [] + self.nofill = 0 + self.list_stack = [] + + # ------ Methods used internally; some may be overridden + + # --- Formatter interface, taking care of 'savedata' mode; + # shouldn't need to be overridden + + def handle_data(self, data): + if self.savedata is not None: + self.savedata = self.savedata + data + else: + if self.nofill: + self.formatter.add_literal_data(data) + else: + self.formatter.add_flowing_data(data) + + # --- Hooks to save data; shouldn't need to be overridden + + def save_bgn(self): + """Begins saving character data in a buffer instead of sending it + to the formatter object. + + Retrieve the stored data via the save_end() method. Use of the + save_bgn() / save_end() pair may not be nested. + + """ + self.savedata = '' + + def save_end(self): + """Ends buffering character data and returns all data saved since + the preceding call to the save_bgn() method. + + If the nofill flag is false, whitespace is collapsed to single + spaces. A call to this method without a preceding call to the + save_bgn() method will raise a TypeError exception. + + """ + data = self.savedata + self.savedata = None + if not self.nofill: + data = ' '.join(data.split()) + return data + + # --- Hooks for anchors; should probably be overridden + + def anchor_bgn(self, href, name, type): + """This method is called at the start of an anchor region. + + The arguments correspond to the attributes of the <A> tag with + the same names. The default implementation maintains a list of + hyperlinks (defined by the HREF attribute for <A> tags) within + the document. The list of hyperlinks is available as the data + attribute anchorlist. + + """ + self.anchor = href + if self.anchor: + self.anchorlist.append(href) + + def anchor_end(self): + """This method is called at the end of an anchor region. + + The default implementation adds a textual footnote marker using an + index into the list of hyperlinks created by the anchor_bgn()method. + + """ + if self.anchor: + self.handle_data("[%d]" % len(self.anchorlist)) + self.anchor = None + + # --- Hook for images; should probably be overridden + + def handle_image(self, src, alt, *args): + """This method is called to handle images. + + The default implementation simply passes the alt value to the + handle_data() method. + + """ + self.handle_data(alt) + + # --------- Top level elememts + + def start_html(self, attrs): pass + def end_html(self): pass + + def start_head(self, attrs): pass + def end_head(self): pass + + def start_body(self, attrs): pass + def end_body(self): pass + + # ------ Head elements + + def start_title(self, attrs): + self.save_bgn() + + def end_title(self): + self.title = self.save_end() + + def do_base(self, attrs): + for a, v in attrs: + if a == 'href': + self.base = v + + def do_isindex(self, attrs): + self.isindex = 1 + + def do_link(self, attrs): + pass + + def do_meta(self, attrs): + pass + + def do_nextid(self, attrs): # Deprecated + pass + + # ------ Body elements + + # --- Headings + + def start_h1(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h1', 0, 1, 0)) + + def end_h1(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + def start_h2(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h2', 0, 1, 0)) + + def end_h2(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + def start_h3(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h3', 0, 1, 0)) + + def end_h3(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + def start_h4(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h4', 0, 1, 0)) + + def end_h4(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + def start_h5(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h5', 0, 1, 0)) + + def end_h5(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + def start_h6(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font(('h6', 0, 1, 0)) + + def end_h6(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + + # --- Block Structuring Elements + + def do_p(self, attrs): + self.formatter.end_paragraph(1) + + def start_pre(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) + self.nofill = self.nofill + 1 + + def end_pre(self): + self.formatter.end_paragraph(1) + self.formatter.pop_font() + self.nofill = max(0, self.nofill - 1) + + def start_xmp(self, attrs): + self.start_pre(attrs) + self.setliteral('xmp') # Tell SGML parser + + def end_xmp(self): + self.end_pre() + + def start_listing(self, attrs): + self.start_pre(attrs) + self.setliteral('listing') # Tell SGML parser + + def end_listing(self): + self.end_pre() + + def start_address(self, attrs): + self.formatter.end_paragraph(0) + self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) + + def end_address(self): + self.formatter.end_paragraph(0) + self.formatter.pop_font() + + def start_blockquote(self, attrs): + self.formatter.end_paragraph(1) + self.formatter.push_margin('blockquote') + + def end_blockquote(self): + self.formatter.end_paragraph(1) + self.formatter.pop_margin() + + # --- List Elements + + def start_ul(self, attrs): + self.formatter.end_paragraph(not self.list_stack) + self.formatter.push_margin('ul') + self.list_stack.append(['ul', '*', 0]) + + def end_ul(self): + if self.list_stack: del self.list_stack[-1] + self.formatter.end_paragraph(not self.list_stack) + self.formatter.pop_margin() + + def do_li(self, attrs): + self.formatter.end_paragraph(0) + if self.list_stack: + [dummy, label, counter] = top = self.list_stack[-1] + top[2] = counter = counter+1 + else: + label, counter = '*', 0 + self.formatter.add_label_data(label, counter) + + def start_ol(self, attrs): + self.formatter.end_paragraph(not self.list_stack) + self.formatter.push_margin('ol') + label = '1.' + for a, v in attrs: + if a == 'type': + if len(v) == 1: v = v + '.' + label = v + self.list_stack.append(['ol', label, 0]) + + def end_ol(self): + if self.list_stack: del self.list_stack[-1] + self.formatter.end_paragraph(not self.list_stack) + self.formatter.pop_margin() + + def start_menu(self, attrs): + self.start_ul(attrs) + + def end_menu(self): + self.end_ul() + + def start_dir(self, attrs): + self.start_ul(attrs) + + def end_dir(self): + self.end_ul() + + def start_dl(self, attrs): + self.formatter.end_paragraph(1) + self.list_stack.append(['dl', '', 0]) + + def end_dl(self): + self.ddpop(1) + if self.list_stack: del self.list_stack[-1] + + def do_dt(self, attrs): + self.ddpop() + + def do_dd(self, attrs): + self.ddpop() + self.formatter.push_margin('dd') + self.list_stack.append(['dd', '', 0]) + + def ddpop(self, bl=0): + self.formatter.end_paragraph(bl) + if self.list_stack: + if self.list_stack[-1][0] == 'dd': + del self.list_stack[-1] + self.formatter.pop_margin() + + # --- Phrase Markup + + # Idiomatic Elements + + def start_cite(self, attrs): self.start_i(attrs) + def end_cite(self): self.end_i() + + def start_code(self, attrs): self.start_tt(attrs) + def end_code(self): self.end_tt() + + def start_em(self, attrs): self.start_i(attrs) + def end_em(self): self.end_i() + + def start_kbd(self, attrs): self.start_tt(attrs) + def end_kbd(self): self.end_tt() + + def start_samp(self, attrs): self.start_tt(attrs) + def end_samp(self): self.end_tt() + + def start_strong(self, attrs): self.start_b(attrs) + def end_strong(self): self.end_b() + + def start_var(self, attrs): self.start_i(attrs) + def end_var(self): self.end_i() + + # Typographic Elements + + def start_i(self, attrs): + self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) + def end_i(self): + self.formatter.pop_font() + + def start_b(self, attrs): + self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS)) + def end_b(self): + self.formatter.pop_font() + + def start_tt(self, attrs): + self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) + def end_tt(self): + self.formatter.pop_font() + + def start_a(self, attrs): + href = '' + name = '' + type = '' + for attrname, value in attrs: + value = value.strip() + if attrname == 'href': + href = value + if attrname == 'name': + name = value + if attrname == 'type': + type = value.lower() + self.anchor_bgn(href, name, type) + + def end_a(self): + self.anchor_end() + + # --- Line Break + + def do_br(self, attrs): + self.formatter.add_line_break() + + # --- Horizontal Rule + + def do_hr(self, attrs): + self.formatter.add_hor_rule() + + # --- Image + + def do_img(self, attrs): + align = '' + alt = '(image)' + ismap = '' + src = '' + width = 0 + height = 0 + for attrname, value in attrs: + if attrname == 'align': + align = value + if attrname == 'alt': + alt = value + if attrname == 'ismap': + ismap = value + if attrname == 'src': + src = value + if attrname == 'width': + try: width = int(value) + except ValueError: pass + if attrname == 'height': + try: height = int(value) + except ValueError: pass + self.handle_image(src, alt, ismap, align, width, height) + + # --- Really Old Unofficial Deprecated Stuff + + def do_plaintext(self, attrs): + self.start_pre(attrs) + self.setnomoretags() # Tell SGML parser + + # --- Unhandled tags + + def unknown_starttag(self, tag, attrs): + pass + + def unknown_endtag(self, tag): + pass + + +def test(args = None): + import sys, formatter + + if not args: + args = sys.argv[1:] + + silent = args and args[0] == '-s' + if silent: + del args[0] + + if args: + file = args[0] + else: + file = 'test.html' + + if file == '-': + f = sys.stdin + else: + try: + f = open(file, 'r') + except IOError, msg: + print file, ":", msg + sys.exit(1) + + data = f.read() + + if f is not sys.stdin: + f.close() + + if silent: + f = formatter.NullFormatter() + else: + f = formatter.AbstractFormatter(formatter.DumbWriter()) + + p = HTMLParser(f) + p.feed(data) + p.close() + + +if __name__ == '__main__': + test() |