diff options
Diffstat (limited to 'venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py')
-rw-r--r-- | venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py | 2791 |
1 files changed, 0 insertions, 2791 deletions
diff --git a/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py b/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py deleted file mode 100644 index ae41a13..0000000 --- a/venv/Lib/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py +++ /dev/null @@ -1,2791 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals -from pip._vendor.six import with_metaclass, viewkeys - -import types -from collections import OrderedDict - -from . import _inputstream -from . import _tokenizer - -from . import treebuilders -from .treebuilders.base import Marker - -from . import _utils -from .constants import ( - spaceCharacters, asciiUpper2Lower, - specialElements, headingElements, cdataElements, rcdataElements, - tokenTypes, tagTokenTypes, - namespaces, - htmlIntegrationPointElements, mathmlTextIntegrationPointElements, - adjustForeignAttributes as adjustForeignAttributesMap, - adjustMathMLAttributes, adjustSVGAttributes, - E, - _ReparseException -) - - -def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): - """Parse an HTML document as a string or file-like object into a tree - - :arg doc: the document to parse as a string or file-like object - - :arg treebuilder: the treebuilder to use when parsing - - :arg namespaceHTMLElements: whether or not to namespace HTML elements - - :returns: parsed tree - - Example: - - >>> from html5lib.html5parser import parse - >>> parse('<html><body><p>This is a doc</p></body></html>') - <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> - - """ - tb = treebuilders.getTreeBuilder(treebuilder) - p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parse(doc, **kwargs) - - -def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): - """Parse an HTML fragment as a string or file-like object into a tree - - :arg doc: the fragment to parse as a string or file-like object - - :arg container: the container context to parse the fragment in - - :arg treebuilder: the treebuilder to use when parsing - - :arg namespaceHTMLElements: whether or not to namespace HTML elements - - :returns: parsed tree - - Example: - - >>> from html5lib.html5libparser import parseFragment - >>> parseFragment('<b>this is a fragment</b>') - <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> - - """ - tb = treebuilders.getTreeBuilder(treebuilder) - p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parseFragment(doc, container=container, **kwargs) - - -def method_decorator_metaclass(function): - class Decorated(type): - def __new__(meta, classname, bases, classDict): - for attributeName, attribute in classDict.items(): - if isinstance(attribute, types.FunctionType): - attribute = function(attribute) - - classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) - return Decorated - - -class HTMLParser(object): - """HTML parser - - Generates a tree structure from a stream of (possibly malformed) HTML. - - """ - - def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): - """ - :arg tree: a treebuilder class controlling the type of tree that will be - returned. Built in treebuilders can be accessed through - html5lib.treebuilders.getTreeBuilder(treeType) - - :arg strict: raise an exception when a parse error is encountered - - :arg namespaceHTMLElements: whether or not to namespace HTML elements - - :arg debug: whether or not to enable debug mode which logs things - - Example: - - >>> from html5lib.html5parser import HTMLParser - >>> parser = HTMLParser() # generates parser with etree builder - >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict - - """ - - # Raise an exception on the first error encountered - self.strict = strict - - if tree is None: - tree = treebuilders.getTreeBuilder("etree") - self.tree = tree(namespaceHTMLElements) - self.errors = [] - - self.phases = dict([(name, cls(self, self.tree)) for name, cls in - getPhases(debug).items()]) - - def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): - - self.innerHTMLMode = innerHTML - self.container = container - self.scripting = scripting - self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) - self.reset() - - try: - self.mainLoop() - except _ReparseException: - self.reset() - self.mainLoop() - - def reset(self): - self.tree.reset() - self.firstStartTag = False - self.errors = [] - self.log = [] # only used with debug mode - # "quirks" / "limited quirks" / "no quirks" - self.compatMode = "no quirks" - - if self.innerHTMLMode: - self.innerHTML = self.container.lower() - - if self.innerHTML in cdataElements: - self.tokenizer.state = self.tokenizer.rcdataState - elif self.innerHTML in rcdataElements: - self.tokenizer.state = self.tokenizer.rawtextState - elif self.innerHTML == 'plaintext': - self.tokenizer.state = self.tokenizer.plaintextState - else: - # state already is data state - # self.tokenizer.state = self.tokenizer.dataState - pass - self.phase = self.phases["beforeHtml"] - self.phase.insertHtmlElement() - self.resetInsertionMode() - else: - self.innerHTML = False # pylint:disable=redefined-variable-type - self.phase = self.phases["initial"] - - self.lastPhase = None - - self.beforeRCDataPhase = None - - self.framesetOK = True - - @property - def documentEncoding(self): - """Name of the character encoding that was used to decode the input stream, or - :obj:`None` if that is not determined yet - - """ - if not hasattr(self, 'tokenizer'): - return None - return self.tokenizer.stream.charEncoding[0].name - - def isHTMLIntegrationPoint(self, element): - if (element.name == "annotation-xml" and - element.namespace == namespaces["mathml"]): - return ("encoding" in element.attributes and - element.attributes["encoding"].translate( - asciiUpper2Lower) in - ("text/html", "application/xhtml+xml")) - else: - return (element.namespace, element.name) in htmlIntegrationPointElements - - def isMathMLTextIntegrationPoint(self, element): - return (element.namespace, element.name) in mathmlTextIntegrationPointElements - - def mainLoop(self): - CharactersToken = tokenTypes["Characters"] - SpaceCharactersToken = tokenTypes["SpaceCharacters"] - StartTagToken = tokenTypes["StartTag"] - EndTagToken = tokenTypes["EndTag"] - CommentToken = tokenTypes["Comment"] - DoctypeToken = tokenTypes["Doctype"] - ParseErrorToken = tokenTypes["ParseError"] - - for token in self.normalizedTokens(): - prev_token = None - new_token = token - while new_token is not None: - prev_token = new_token - currentNode = self.tree.openElements[-1] if self.tree.openElements else None - currentNodeNamespace = currentNode.namespace if currentNode else None - currentNodeName = currentNode.name if currentNode else None - - type = new_token["type"] - - if type == ParseErrorToken: - self.parseError(new_token["data"], new_token.get("datavars", {})) - new_token = None - else: - if (len(self.tree.openElements) == 0 or - currentNodeNamespace == self.tree.defaultNamespace or - (self.isMathMLTextIntegrationPoint(currentNode) and - ((type == StartTagToken and - token["name"] not in frozenset(["mglyph", "malignmark"])) or - type in (CharactersToken, SpaceCharactersToken))) or - (currentNodeNamespace == namespaces["mathml"] and - currentNodeName == "annotation-xml" and - type == StartTagToken and - token["name"] == "svg") or - (self.isHTMLIntegrationPoint(currentNode) and - type in (StartTagToken, CharactersToken, SpaceCharactersToken))): - phase = self.phase - else: - phase = self.phases["inForeignContent"] - - if type == CharactersToken: - new_token = phase.processCharacters(new_token) - elif type == SpaceCharactersToken: - new_token = phase.processSpaceCharacters(new_token) - elif type == StartTagToken: - new_token = phase.processStartTag(new_token) - elif type == EndTagToken: - new_token = phase.processEndTag(new_token) - elif type == CommentToken: - new_token = phase.processComment(new_token) - elif type == DoctypeToken: - new_token = phase.processDoctype(new_token) - - if (type == StartTagToken and prev_token["selfClosing"] and - not prev_token["selfClosingAcknowledged"]): - self.parseError("non-void-element-with-trailing-solidus", - {"name": prev_token["name"]}) - - # When the loop finishes it's EOF - reprocess = True - phases = [] - while reprocess: - phases.append(self.phase) - reprocess = self.phase.processEOF() - if reprocess: - assert self.phase not in phases - - def normalizedTokens(self): - for token in self.tokenizer: - yield self.normalizeToken(token) - - def parse(self, stream, *args, **kwargs): - """Parse a HTML document into a well-formed tree - - :arg stream: a file-like object or string containing the HTML to be parsed - - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element). - - :arg scripting: treat noscript elements as if JavaScript was turned on - - :returns: parsed tree - - Example: - - >>> from html5lib.html5parser import HTMLParser - >>> parser = HTMLParser() - >>> parser.parse('<html><body><p>This is a doc</p></body></html>') - <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> - - """ - self._parse(stream, False, None, *args, **kwargs) - return self.tree.getDocument() - - def parseFragment(self, stream, *args, **kwargs): - """Parse a HTML fragment into a well-formed tree fragment - - :arg container: name of the element we're setting the innerHTML - property if set to None, default to 'div' - - :arg stream: a file-like object or string containing the HTML to be parsed - - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) - - :arg scripting: treat noscript elements as if JavaScript was turned on - - :returns: parsed tree - - Example: - - >>> from html5lib.html5libparser import HTMLParser - >>> parser = HTMLParser() - >>> parser.parseFragment('<b>this is a fragment</b>') - <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> - - """ - self._parse(stream, True, *args, **kwargs) - return self.tree.getFragment() - - def parseError(self, errorcode="XXX-undefined-error", datavars=None): - # XXX The idea is to make errorcode mandatory. - if datavars is None: - datavars = {} - self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) - if self.strict: - raise ParseError(E[errorcode] % datavars) - - def normalizeToken(self, token): - # HTML5 specific normalizations to the token stream - if token["type"] == tokenTypes["StartTag"]: - raw = token["data"] - token["data"] = OrderedDict(raw) - if len(raw) > len(token["data"]): - # we had some duplicated attribute, fix so first wins - token["data"].update(raw[::-1]) - - return token - - def adjustMathMLAttributes(self, token): - adjust_attributes(token, adjustMathMLAttributes) - - def adjustSVGAttributes(self, token): - adjust_attributes(token, adjustSVGAttributes) - - def adjustForeignAttributes(self, token): - adjust_attributes(token, adjustForeignAttributesMap) - - def reparseTokenNormal(self, token): - # pylint:disable=unused-argument - self.parser.phase() - - def resetInsertionMode(self): - # The name of this method is mostly historical. (It's also used in the - # specification.) - last = False - newModes = { - "select": "inSelect", - "td": "inCell", - "th": "inCell", - "tr": "inRow", - "tbody": "inTableBody", - "thead": "inTableBody", - "tfoot": "inTableBody", - "caption": "inCaption", - "colgroup": "inColumnGroup", - "table": "inTable", - "head": "inBody", - "body": "inBody", - "frameset": "inFrameset", - "html": "beforeHead" - } - for node in self.tree.openElements[::-1]: - nodeName = node.name - new_phase = None - if node == self.tree.openElements[0]: - assert self.innerHTML - last = True - nodeName = self.innerHTML - # Check for conditions that should only happen in the innerHTML - # case - if nodeName in ("select", "colgroup", "head", "html"): - assert self.innerHTML - - if not last and node.namespace != self.tree.defaultNamespace: - continue - - if nodeName in newModes: - new_phase = self.phases[newModes[nodeName]] - break - elif last: - new_phase = self.phases["inBody"] - break - - self.phase = new_phase - - def parseRCDataRawtext(self, token, contentType): - # Generic RCDATA/RAWTEXT Parsing algorithm - assert contentType in ("RAWTEXT", "RCDATA") - - self.tree.insertElement(token) - - if contentType == "RAWTEXT": - self.tokenizer.state = self.tokenizer.rawtextState - else: - self.tokenizer.state = self.tokenizer.rcdataState - - self.originalPhase = self.phase - - self.phase = self.phases["text"] - - -@_utils.memoize -def getPhases(debug): - def log(function): - """Logger that records which phase processes each token""" - type_names = dict((value, key) for key, value in - tokenTypes.items()) - - def wrapped(self, *args, **kwargs): - if function.__name__.startswith("process") and len(args) > 0: - token = args[0] - try: - info = {"type": type_names[token['type']]} - except: - raise - if token['type'] in tagTokenTypes: - info["name"] = token['name'] - - self.parser.log.append((self.parser.tokenizer.state.__name__, - self.parser.phase.__class__.__name__, - self.__class__.__name__, - function.__name__, - info)) - return function(self, *args, **kwargs) - else: - return function(self, *args, **kwargs) - return wrapped - - def getMetaclass(use_metaclass, metaclass_func): - if use_metaclass: - return method_decorator_metaclass(metaclass_func) - else: - return type - - # pylint:disable=unused-argument - class Phase(with_metaclass(getMetaclass(debug, log))): - """Base class for helper object that implements each phase of processing - """ - - def __init__(self, parser, tree): - self.parser = parser - self.tree = tree - - def processEOF(self): - raise NotImplementedError - - def processComment(self, token): - # For most phases the following is correct. Where it's not it will be - # overridden. - self.tree.insertComment(token, self.tree.openElements[-1]) - - def processDoctype(self, token): - self.parser.parseError("unexpected-doctype") - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processSpaceCharacters(self, token): - self.tree.insertText(token["data"]) - - def processStartTag(self, token): - return self.startTagHandler[token["name"]](token) - - def startTagHtml(self, token): - if not self.parser.firstStartTag and token["name"] == "html": - self.parser.parseError("non-html-root") - # XXX Need a check here to see if the first start tag token emitted is - # this token... If it's not, invoke self.parser.parseError(). - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[0].attributes: - self.tree.openElements[0].attributes[attr] = value - self.parser.firstStartTag = False - - def processEndTag(self, token): - return self.endTagHandler[token["name"]](token) - - class InitialPhase(Phase): - def processSpaceCharacters(self, token): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - correct = token["correct"] - - if (name != "html" or publicId is not None or - systemId is not None and systemId != "about:legacy-compat"): - self.parser.parseError("unknown-doctype") - - if publicId is None: - publicId = "" - - self.tree.insertDoctype(token) - - if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) - - if (not correct or token["name"] != "html" or - publicId.startswith( - ("+//silmaril//dtd html pro v0r11 19970101//", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", - "-//as//dtd html 3.0 aswedit + extensions//", - "-//ietf//dtd html 2.0 level 1//", - "-//ietf//dtd html 2.0 level 2//", - "-//ietf//dtd html 2.0 strict level 1//", - "-//ietf//dtd html 2.0 strict level 2//", - "-//ietf//dtd html 2.0 strict//", - "-//ietf//dtd html 2.0//", - "-//ietf//dtd html 2.1e//", - "-//ietf//dtd html 3.0//", - "-//ietf//dtd html 3.2 final//", - "-//ietf//dtd html 3.2//", - "-//ietf//dtd html 3//", - "-//ietf//dtd html level 0//", - "-//ietf//dtd html level 1//", - "-//ietf//dtd html level 2//", - "-//ietf//dtd html level 3//", - "-//ietf//dtd html strict level 0//", - "-//ietf//dtd html strict level 1//", - "-//ietf//dtd html strict level 2//", - "-//ietf//dtd html strict level 3//", - "-//ietf//dtd html strict//", - "-//ietf//dtd html//", - "-//metrius//dtd metrius presentational//", - "-//microsoft//dtd internet explorer 2.0 html strict//", - "-//microsoft//dtd internet explorer 2.0 html//", - "-//microsoft//dtd internet explorer 2.0 tables//", - "-//microsoft//dtd internet explorer 3.0 html strict//", - "-//microsoft//dtd internet explorer 3.0 html//", - "-//microsoft//dtd internet explorer 3.0 tables//", - "-//netscape comm. corp.//dtd html//", - "-//netscape comm. corp.//dtd strict html//", - "-//o'reilly and associates//dtd html 2.0//", - "-//o'reilly and associates//dtd html extended 1.0//", - "-//o'reilly and associates//dtd html extended relaxed 1.0//", - "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", - "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", - "-//spyglass//dtd html 2.0 extended//", - "-//sq//dtd html 2.0 hotmetal + extensions//", - "-//sun microsystems corp.//dtd hotjava html//", - "-//sun microsystems corp.//dtd hotjava strict html//", - "-//w3c//dtd html 3 1995-03-24//", - "-//w3c//dtd html 3.2 draft//", - "-//w3c//dtd html 3.2 final//", - "-//w3c//dtd html 3.2//", - "-//w3c//dtd html 3.2s draft//", - "-//w3c//dtd html 4.0 frameset//", - "-//w3c//dtd html 4.0 transitional//", - "-//w3c//dtd html experimental 19960712//", - "-//w3c//dtd html experimental 970421//", - "-//w3c//dtd w3 html//", - "-//w3o//dtd w3 html 3.0//", - "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) or - publicId in ("-//w3o//dtd w3 html strict 3.0//en//", - "-/w3c/dtd html 4.0 transitional/en", - "html") or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is None or - systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): - self.parser.compatMode = "quirks" - elif (publicId.startswith( - ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is not None): - self.parser.compatMode = "limited quirks" - - self.parser.phase = self.parser.phases["beforeHtml"] - - def anythingElse(self): - self.parser.compatMode = "quirks" - self.parser.phase = self.parser.phases["beforeHtml"] - - def processCharacters(self, token): - self.parser.parseError("expected-doctype-but-got-chars") - self.anythingElse() - return token - - def processStartTag(self, token): - self.parser.parseError("expected-doctype-but-got-start-tag", - {"name": token["name"]}) - self.anythingElse() - return token - - def processEndTag(self, token): - self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) - self.anythingElse() - return token - - def processEOF(self): - self.parser.parseError("expected-doctype-but-got-eof") - self.anythingElse() - return True - - class BeforeHtmlPhase(Phase): - # helper methods - def insertHtmlElement(self): - self.tree.insertRoot(impliedTagToken("html", "StartTag")) - self.parser.phase = self.parser.phases["beforeHead"] - - # other - def processEOF(self): - self.insertHtmlElement() - return True - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processSpaceCharacters(self, token): - pass - - def processCharacters(self, token): - self.insertHtmlElement() - return token - - def processStartTag(self, token): - if token["name"] == "html": - self.parser.firstStartTag = True - self.insertHtmlElement() - return token - - def processEndTag(self, token): - if token["name"] not in ("head", "body", "html", "br"): - self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) - else: - self.insertHtmlElement() - return token - - class BeforeHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("head", "body", "html", "br"), self.endTagImplyHead) - ]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - self.startTagHead(impliedTagToken("head", "StartTag")) - return True - - def processSpaceCharacters(self, token): - pass - - def processCharacters(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagHead(self, token): - self.tree.insertElement(token) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] - - def startTagOther(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token - - def endTagImplyHead(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token - - def endTagOther(self, token): - self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) - - class InHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("title", self.startTagTitle), - (("noframes", "style"), self.startTagNoFramesStyle), - ("noscript", self.startTagNoscript), - ("script", self.startTagScript), - (("base", "basefont", "bgsound", "command", "link"), - self.startTagBaseLinkCommand), - ("meta", self.startTagMeta), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("head", self.endTagHead), - (("br", "html", "body"), self.endTagHtmlBodyBr) - ]) - self.endTagHandler.default = self.endTagOther - - # the real thing - def processEOF(self): - self.anythingElse() - return True - - def processCharacters(self, token): - self.anythingElse() - return token - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagHead(self, token): - self.parser.parseError("two-heads-are-not-better-than-one") - - def startTagBaseLinkCommand(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagMeta(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - attributes = token["data"] - if self.parser.tokenizer.stream.charEncoding[1] == "tentative": - if "charset" in attributes: - self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) - elif ("content" in attributes and - "http-equiv" in attributes and - attributes["http-equiv"].lower() == "content-type"): - # Encoding it as UTF-8 here is a hack, as really we should pass - # the abstract Unicode string, and just use the - # ContentAttrParser on that, but using UTF-8 allows all chars - # to be encoded and as a ASCII-superset works. - data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) - parser = _inputstream.ContentAttrParser(data) - codec = parser.parse() - self.parser.tokenizer.stream.changeEncoding(codec) - - def startTagTitle(self, token): - self.parser.parseRCDataRawtext(token, "RCDATA") - - def startTagNoFramesStyle(self, token): - # Need to decide whether to implement the scripting-disabled case - self.parser.parseRCDataRawtext(token, "RAWTEXT") - - def startTagNoscript(self, token): - if self.parser.scripting: - self.parser.parseRCDataRawtext(token, "RAWTEXT") - else: - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inHeadNoscript"] - - def startTagScript(self, token): - self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState - self.parser.originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["text"] - - def startTagOther(self, token): - self.anythingElse() - return token - - def endTagHead(self, token): - node = self.parser.tree.openElements.pop() - assert node.name == "head", "Expected head got %s" % node.name - self.parser.phase = self.parser.phases["afterHead"] - - def endTagHtmlBodyBr(self, token): - self.anythingElse() - return token - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def anythingElse(self): - self.endTagHead(impliedTagToken("head")) - - class InHeadNoscriptPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), - (("head", "noscript"), self.startTagHeadNoscript), - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("noscript", self.endTagNoscript), - ("br", self.endTagBr), - ]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - self.parser.parseError("eof-in-head-noscript") - self.anythingElse() - return True - - def processComment(self, token): - return self.parser.phases["inHead"].processComment(token) - - def processCharacters(self, token): - self.parser.parseError("char-in-head-noscript") - self.anythingElse() - return token - - def processSpaceCharacters(self, token): - return self.parser.phases["inHead"].processSpaceCharacters(token) - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagBaseLinkCommand(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagHeadNoscript(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) - - def startTagOther(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) - self.anythingElse() - return token - - def endTagNoscript(self, token): - node = self.parser.tree.openElements.pop() - assert node.name == "noscript", "Expected noscript got %s" % node.name - self.parser.phase = self.parser.phases["inHead"] - - def endTagBr(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) - self.anythingElse() - return token - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def anythingElse(self): - # Caller must raise parse error first! - self.endTagNoscript(impliedTagToken("noscript")) - - class AfterHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("body", self.startTagBody), - ("frameset", self.startTagFrameset), - (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", - "style", "title"), - self.startTagFromHead), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), - self.endTagHtmlBodyBr)]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - self.anythingElse() - return True - - def processCharacters(self, token): - self.anythingElse() - return token - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagBody(self, token): - self.parser.framesetOK = False - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inBody"] - - def startTagFrameset(self, token): - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inFrameset"] - - def startTagFromHead(self, token): - self.parser.parseError("unexpected-start-tag-out-of-my-head", - {"name": token["name"]}) - self.tree.openElements.append(self.tree.headPointer) - self.parser.phases["inHead"].processStartTag(token) - for node in self.tree.openElements[::-1]: - if node.name == "head": - self.tree.openElements.remove(node) - break - - def startTagHead(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) - - def startTagOther(self, token): - self.anythingElse() - return token - - def endTagHtmlBodyBr(self, token): - self.anythingElse() - return token - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def anythingElse(self): - self.tree.insertElement(impliedTagToken("body", "StartTag")) - self.parser.phase = self.parser.phases["inBody"] - self.parser.framesetOK = True - - class InBodyPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody - # the really-really-really-very crazy mode - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - # Set this to the default handler - self.processSpaceCharacters = self.processSpaceCharactersNonPre - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("base", "basefont", "bgsound", "command", "link", "meta", - "script", "style", "title"), - self.startTagProcessInHead), - ("body", self.startTagBody), - ("frameset", self.startTagFrameset), - (("address", "article", "aside", "blockquote", "center", "details", - "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", - "section", "summary", "ul"), - self.startTagCloseP), - (headingElements, self.startTagHeading), - (("pre", "listing"), self.startTagPreListing), - ("form", self.startTagForm), - (("li", "dd", "dt"), self.startTagListItem), - ("plaintext", self.startTagPlaintext), - ("a", self.startTagA), - (("b", "big", "code", "em", "font", "i", "s", "small", "strike", - "strong", "tt", "u"), self.startTagFormatting), - ("nobr", self.startTagNobr), - ("button", self.startTagButton), - (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), - ("xmp", self.startTagXmp), - ("table", self.startTagTable), - (("area", "br", "embed", "img", "keygen", "wbr"), - self.startTagVoidFormatting), - (("param", "source", "track"), self.startTagParamSource), - ("input", self.startTagInput), - ("hr", self.startTagHr), - ("image", self.startTagImage), - ("isindex", self.startTagIsIndex), - ("textarea", self.startTagTextarea), - ("iframe", self.startTagIFrame), - ("noscript", self.startTagNoscript), - (("noembed", "noframes"), self.startTagRawtext), - ("select", self.startTagSelect), - (("rp", "rt"), self.startTagRpRt), - (("option", "optgroup"), self.startTagOpt), - (("math"), self.startTagMath), - (("svg"), self.startTagSvg), - (("caption", "col", "colgroup", "frame", "head", - "tbody", "td", "tfoot", "th", "thead", - "tr"), self.startTagMisplaced) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("body", self.endTagBody), - ("html", self.endTagHtml), - (("address", "article", "aside", "blockquote", "button", "center", - "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", - "section", "summary", "ul"), self.endTagBlock), - ("form", self.endTagForm), - ("p", self.endTagP), - (("dd", "dt", "li"), self.endTagListItem), - (headingElements, self.endTagHeading), - (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", - "strike", "strong", "tt", "u"), self.endTagFormatting), - (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), - ("br", self.endTagBr), - ]) - self.endTagHandler.default = self.endTagOther - - def isMatchingFormattingElement(self, node1, node2): - return (node1.name == node2.name and - node1.namespace == node2.namespace and - node1.attributes == node2.attributes) - - # helper - def addFormattingElement(self, token): - self.tree.insertElement(token) - element = self.tree.openElements[-1] - - matchingElements = [] - for node in self.tree.activeFormattingElements[::-1]: - if node is Marker: - break - elif self.isMatchingFormattingElement(node, element): - matchingElements.append(node) - - assert len(matchingElements) <= 3 - if len(matchingElements) == 3: - self.tree.activeFormattingElements.remove(matchingElements[-1]) - self.tree.activeFormattingElements.append(element) - - # the real deal - def processEOF(self): - allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", - "tfoot", "th", "thead", "tr", "body", - "html")) - for node in self.tree.openElements[::-1]: - if node.name not in allowed_elements: - self.parser.parseError("expected-closing-tag-but-got-eof") - break - # Stop parsing - - def processSpaceCharactersDropNewline(self, token): - # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we - # want to drop leading newlines - data = token["data"] - self.processSpaceCharacters = self.processSpaceCharactersNonPre - if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "listing", "textarea") and - not self.tree.openElements[-1].hasContent()): - data = data[1:] - if data: - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) - - def processCharacters(self, token): - if token["data"] == "\u0000": - # The tokenizer should always emit null on its own - return - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(token["data"]) - # This must be bad for performance - if (self.parser.framesetOK and - any([char not in spaceCharacters - for char in token["data"]])): - self.parser.framesetOK = False - - def processSpaceCharactersNonPre(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(token["data"]) - - def startTagProcessInHead(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagBody(self, token): - self.parser.parseError("unexpected-start-tag", {"name": "body"}) - if (len(self.tree.openElements) == 1 or - self.tree.openElements[1].name != "body"): - assert self.parser.innerHTML - else: - self.parser.framesetOK = False - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[1].attributes: - self.tree.openElements[1].attributes[attr] = value - - def startTagFrameset(self, token): - self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) - if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): - assert self.parser.innerHTML - elif not self.parser.framesetOK: - pass - else: - if self.tree.openElements[1].parent: - self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) - while self.tree.openElements[-1].name != "html": - self.tree.openElements.pop() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inFrameset"] - - def startTagCloseP(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - - def startTagPreListing(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.framesetOK = False - self.processSpaceCharacters = self.processSpaceCharactersDropNewline - - def startTagForm(self, token): - if self.tree.formPointer: - self.parser.parseError("unexpected-start-tag", {"name": "form"}) - else: - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.tree.formPointer = self.tree.openElements[-1] - - def startTagListItem(self, token): - self.parser.framesetOK = False - - stopNamesMap = {"li": ["li"], - "dt": ["dt", "dd"], - "dd": ["dt", "dd"]} - stopNames = stopNamesMap[token["name"]] - for node in reversed(self.tree.openElements): - if node.name in stopNames: - self.parser.phase.processEndTag( - impliedTagToken(node.name, "EndTag")) - break - if (node.nameTuple in specialElements and - node.name not in ("address", "div", "p")): - break - - if self.tree.elementInScope("p", variant="button"): - self.parser.phase.processEndTag( - impliedTagToken("p", "EndTag")) - - self.tree.insertElement(token) - - def startTagPlaintext(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.plaintextState - - def startTagHeading(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - if self.tree.openElements[-1].name in headingElements: - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) - self.tree.openElements.pop() - self.tree.insertElement(token) - - def startTagA(self, token): - afeAElement = self.tree.elementInActiveFormattingElements("a") - if afeAElement: - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "a", "endName": "a"}) - self.endTagFormatting(impliedTagToken("a")) - if afeAElement in self.tree.openElements: - self.tree.openElements.remove(afeAElement) - if afeAElement in self.tree.activeFormattingElements: - self.tree.activeFormattingElements.remove(afeAElement) - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) - - def startTagFormatting(self, token): - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) - - def startTagNobr(self, token): - self.tree.reconstructActiveFormattingElements() - if self.tree.elementInScope("nobr"): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "nobr", "endName": "nobr"}) - self.processEndTag(impliedTagToken("nobr")) - # XXX Need tests that trigger the following - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) - - def startTagButton(self, token): - if self.tree.elementInScope("button"): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "button", "endName": "button"}) - self.processEndTag(impliedTagToken("button")) - return token - else: - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.parser.framesetOK = False - - def startTagAppletMarqueeObject(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.tree.activeFormattingElements.append(Marker) - self.parser.framesetOK = False - - def startTagXmp(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.reconstructActiveFormattingElements() - self.parser.framesetOK = False - self.parser.parseRCDataRawtext(token, "RAWTEXT") - - def startTagTable(self, token): - if self.parser.compatMode != "quirks": - if self.tree.elementInScope("p", variant="button"): - self.processEndTag(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.framesetOK = False - self.parser.phase = self.parser.phases["inTable"] - - def startTagVoidFormatting(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - self.parser.framesetOK = False - - def startTagInput(self, token): - framesetOK = self.parser.framesetOK - self.startTagVoidFormatting(token) - if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): - # input type=hidden doesn't change framesetOK - self.parser.framesetOK = framesetOK - - def startTagParamSource(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagHr(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - self.parser.framesetOK = False - - def startTagImage(self, token): - # No really... - self.parser.parseError("unexpected-start-tag-treated-as", - {"originalName": "image", "newName": "img"}) - self.processStartTag(impliedTagToken("img", "StartTag", - attributes=token["data"], - selfClosing=token["selfClosing"])) - - def startTagIsIndex(self, token): - self.parser.parseError("deprecated-tag", {"name": "isindex"}) - if self.tree.formPointer: - return - form_attrs = {} - if "action" in token["data"]: - form_attrs["action"] = token["data"]["action"] - self.processStartTag(impliedTagToken("form", "StartTag", - attributes=form_attrs)) - self.processStartTag(impliedTagToken("hr", "StartTag")) - self.processStartTag(impliedTagToken("label", "StartTag")) - # XXX Localization ... - if "prompt" in token["data"]: - prompt = token["data"]["prompt"] - else: - prompt = "This is a searchable index. Enter search keywords: " - self.processCharacters( - {"type": tokenTypes["Characters"], "data": prompt}) - attributes = token["data"].copy() - if "action" in attributes: - del attributes["action"] - if "prompt" in attributes: - del attributes["prompt"] - attributes["name"] = "isindex" - self.processStartTag(impliedTagToken("input", "StartTag", - attributes=attributes, - selfClosing=token["selfClosing"])) - self.processEndTag(impliedTagToken("label")) - self.processStartTag(impliedTagToken("hr", "StartTag")) - self.processEndTag(impliedTagToken("form")) - - def startTagTextarea(self, token): - self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.rcdataState - self.processSpaceCharacters = self.processSpaceCharactersDropNewline - self.parser.framesetOK = False - - def startTagIFrame(self, token): - self.parser.framesetOK = False - self.startTagRawtext(token) - - def startTagNoscript(self, token): - if self.parser.scripting: - self.startTagRawtext(token) - else: - self.startTagOther(token) - - def startTagRawtext(self, token): - """iframe, noembed noframes, noscript(if scripting enabled)""" - self.parser.parseRCDataRawtext(token, "RAWTEXT") - - def startTagOpt(self, token): - if self.tree.openElements[-1].name == "option": - self.parser.phase.processEndTag(impliedTagToken("option")) - self.tree.reconstructActiveFormattingElements() - self.parser.tree.insertElement(token) - - def startTagSelect(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.parser.framesetOK = False - if self.parser.phase in (self.parser.phases["inTable"], - self.parser.phases["inCaption"], - self.parser.phases["inColumnGroup"], - self.parser.phases["inTableBody"], - self.parser.phases["inRow"], - self.parser.phases["inCell"]): - self.parser.phase = self.parser.phases["inSelectInTable"] - else: - self.parser.phase = self.parser.phases["inSelect"] - - def startTagRpRt(self, token): - if self.tree.elementInScope("ruby"): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "ruby": - self.parser.parseError() - self.tree.insertElement(token) - - def startTagMath(self, token): - self.tree.reconstructActiveFormattingElements() - self.parser.adjustMathMLAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = namespaces["mathml"] - self.tree.insertElement(token) - # Need to get the parse error right for the case where the token - # has a namespace not equal to the xmlns attribute - if token["selfClosing"]: - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagSvg(self, token): - self.tree.reconstructActiveFormattingElements() - self.parser.adjustSVGAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = namespaces["svg"] - self.tree.insertElement(token) - # Need to get the parse error right for the case where the token - # has a namespace not equal to the xmlns attribute - if token["selfClosing"]: - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagMisplaced(self, token): - """ Elements that should be children of other elements that have a - different insertion mode; here they are ignored - "caption", "col", "colgroup", "frame", "frameset", "head", - "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", - "tr", "noscript" - """ - self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - - def startTagOther(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - - def endTagP(self, token): - if not self.tree.elementInScope("p", variant="button"): - self.startTagCloseP(impliedTagToken("p", "StartTag")) - self.parser.parseError("unexpected-end-tag", {"name": "p"}) - self.endTagP(impliedTagToken("p", "EndTag")) - else: - self.tree.generateImpliedEndTags("p") - if self.tree.openElements[-1].name != "p": - self.parser.parseError("unexpected-end-tag", {"name": "p"}) - node = self.tree.openElements.pop() - while node.name != "p": - node = self.tree.openElements.pop() - - def endTagBody(self, token): - if not self.tree.elementInScope("body"): - self.parser.parseError() - return - elif self.tree.openElements[-1].name != "body": - for node in self.tree.openElements[2:]: - if node.name not in frozenset(("dd", "dt", "li", "optgroup", - "option", "p", "rp", "rt", - "tbody", "td", "tfoot", - "th", "thead", "tr", "body", - "html")): - # Not sure this is the correct name for the parse error - self.parser.parseError( - "expected-one-end-tag-but-got-another", - {"gotName": "body", "expectedName": node.name}) - break - self.parser.phase = self.parser.phases["afterBody"] - - def endTagHtml(self, token): - # We repeat the test for the body end tag token being ignored here - if self.tree.elementInScope("body"): - self.endTagBody(impliedTagToken("body")) - return token - - def endTagBlock(self, token): - # Put us back in the right whitespace handling mode - if token["name"] == "pre": - self.processSpaceCharacters = self.processSpaceCharactersNonPre - inScope = self.tree.elementInScope(token["name"]) - if inScope: - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if inScope: - node = self.tree.openElements.pop() - while node.name != token["name"]: - node = self.tree.openElements.pop() - - def endTagForm(self, token): - node = self.tree.formPointer - self.tree.formPointer = None - if node is None or not self.tree.elementInScope(node): - self.parser.parseError("unexpected-end-tag", - {"name": "form"}) - else: - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1] != node: - self.parser.parseError("end-tag-too-early-ignored", - {"name": "form"}) - self.tree.openElements.remove(node) - - def endTagListItem(self, token): - if token["name"] == "li": - variant = "list" - else: - variant = None - if not self.tree.elementInScope(token["name"], variant=variant): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - else: - self.tree.generateImpliedEndTags(exclude=token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError( - "end-tag-too-early", - {"name": token["name"]}) - node = self.tree.openElements.pop() - while node.name != token["name"]: - node = self.tree.openElements.pop() - - def endTagHeading(self, token): - for item in headingElements: - if self.tree.elementInScope(item): - self.tree.generateImpliedEndTags() - break - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - - for item in headingElements: - if self.tree.elementInScope(item): - item = self.tree.openElements.pop() - while item.name not in headingElements: - item = self.tree.openElements.pop() - break - - def endTagFormatting(self, token): - """The much-feared adoption agency algorithm""" - # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 - # XXX Better parseError messages appreciated. - - # Step 1 - outerLoopCounter = 0 - - # Step 2 - while outerLoopCounter < 8: - - # Step 3 - outerLoopCounter += 1 - - # Step 4: - - # Let the formatting element be the last element in - # the list of active formatting elements that: - # - is between the end of the list and the last scope - # marker in the list, if any, or the start of the list - # otherwise, and - # - has the same tag name as the token. - formattingElement = self.tree.elementInActiveFormattingElements( - token["name"]) - if (not formattingElement or - (formattingElement in self.tree.openElements and - not self.tree.elementInScope(formattingElement.name))): - # If there is no such node, then abort these steps - # and instead act as described in the "any other - # end tag" entry below. - self.endTagOther(token) - return - - # Otherwise, if there is such a node, but that node is - # not in the stack of open elements, then this is a - # parse error; remove the element from the list, and - # abort these steps. - elif formattingElement not in self.tree.openElements: - self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) - self.tree.activeFormattingElements.remove(formattingElement) - return - - # Otherwise, if there is such a node, and that node is - # also in the stack of open elements, but the element - # is not in scope, then this is a parse error; ignore - # the token, and abort these steps. - elif not self.tree.elementInScope(formattingElement.name): - self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) - return - - # Otherwise, there is a formatting element and that - # element is in the stack and is in scope. If the - # element is not the current node, this is a parse - # error. In any case, proceed with the algorithm as - # written in the following steps. - else: - if formattingElement != self.tree.openElements[-1]: - self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) - - # Step 5: - - # Let the furthest block be the topmost node in the - # stack of open elements that is lower in the stack - # than the formatting element, and is an element in - # the special category. There might not be one. - afeIndex = self.tree.openElements.index(formattingElement) - furthestBlock = None - for element in self.tree.openElements[afeIndex:]: - if element.nameTuple in specialElements: - furthestBlock = element - break - - # Step 6: - - # If there is no furthest block, then the UA must - # first pop all the nodes from the bottom of the stack - # of open elements, from the current node up to and - # including the formatting element, then remove the - # formatting element from the list of active - # formatting elements, and finally abort these steps. - if furthestBlock is None: - element = self.tree.openElements.pop() - while element != formattingElement: - element = self.tree.openElements.pop() - self.tree.activeFormattingElements.remove(element) - return - - # Step 7 - commonAncestor = self.tree.openElements[afeIndex - 1] - - # Step 8: - # The bookmark is supposed to help us identify where to reinsert - # nodes in step 15. We have to ensure that we reinsert nodes after - # the node before the active formatting element. Note the bookmark - # can move in step 9.7 - bookmark = self.tree.activeFormattingElements.index(formattingElement) - - # Step 9 - lastNode = node = furthestBlock - innerLoopCounter = 0 - - index = self.tree.openElements.index(node) - while innerLoopCounter < 3: - innerLoopCounter += 1 - # Node is element before node in open elements - index -= 1 - node = self.tree.openElements[index] - if node not in self.tree.activeFormattingElements: - self.tree.openElements.remove(node) - continue - # Step 9.6 - if node == formattingElement: - break - # Step 9.7 - if lastNode == furthestBlock: - bookmark = self.tree.activeFormattingElements.index(node) + 1 - # Step 9.8 - clone = node.cloneNode() - # Replace node with clone - self.tree.activeFormattingElements[ - self.tree.activeFormattingElements.index(node)] = clone - self.tree.openElements[ - self.tree.openElements.index(node)] = clone - node = clone - # Step 9.9 - # Remove lastNode from its parents, if any - if lastNode.parent: - lastNode.parent.removeChild(lastNode) - node.appendChild(lastNode) - # Step 9.10 - lastNode = node - - # Step 10 - # Foster parent lastNode if commonAncestor is a - # table, tbody, tfoot, thead, or tr we need to foster - # parent the lastNode - if lastNode.parent: - lastNode.parent.removeChild(lastNode) - - if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): - parent, insertBefore = self.tree.getTableMisnestedNodePosition() - parent.insertBefore(lastNode, insertBefore) - else: - commonAncestor.appendChild(lastNode) - - # Step 11 - clone = formattingElement.cloneNode() - - # Step 12 - furthestBlock.reparentChildren(clone) - - # Step 13 - furthestBlock.appendChild(clone) - - # Step 14 - self.tree.activeFormattingElements.remove(formattingElement) - self.tree.activeFormattingElements.insert(bookmark, clone) - - # Step 15 - self.tree.openElements.remove(formattingElement) - self.tree.openElements.insert( - self.tree.openElements.index(furthestBlock) + 1, clone) - - def endTagAppletMarqueeObject(self, token): - if self.tree.elementInScope(token["name"]): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - - if self.tree.elementInScope(token["name"]): - element = self.tree.openElements.pop() - while element.name != token["name"]: - element = self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - - def endTagBr(self, token): - self.parser.parseError("unexpected-end-tag-treated-as", - {"originalName": "br", "newName": "br element"}) - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(impliedTagToken("br", "StartTag")) - self.tree.openElements.pop() - - def endTagOther(self, token): - for node in self.tree.openElements[::-1]: - if node.name == token["name"]: - self.tree.generateImpliedEndTags(exclude=token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - while self.tree.openElements.pop() != node: - pass - break - else: - if node.nameTuple in specialElements: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - break - - class TextPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([]) - self.startTagHandler.default = self.startTagOther - self.endTagHandler = _utils.MethodDispatcher([ - ("script", self.endTagScript)]) - self.endTagHandler.default = self.endTagOther - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processEOF(self): - self.parser.parseError("expected-named-closing-tag-but-got-eof", - {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - self.parser.phase = self.parser.originalPhase - return True - - def startTagOther(self, token): - assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] - - def endTagScript(self, token): - node = self.tree.openElements.pop() - assert node.name == "script" - self.parser.phase = self.parser.originalPhase - # The rest of this method is all stuff that only happens if - # document.write works - - def endTagOther(self, token): - self.tree.openElements.pop() - self.parser.phase = self.parser.originalPhase - - class InTablePhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-table - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("caption", self.startTagCaption), - ("colgroup", self.startTagColgroup), - ("col", self.startTagCol), - (("tbody", "tfoot", "thead"), self.startTagRowGroup), - (("td", "th", "tr"), self.startTagImplyTbody), - ("table", self.startTagTable), - (("style", "script"), self.startTagStyleScript), - ("input", self.startTagInput), - ("form", self.startTagForm) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("table", self.endTagTable), - (("body", "caption", "col", "colgroup", "html", "tbody", "td", - "tfoot", "th", "thead", "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther - - # helper methods - def clearStackToTableContext(self): - # "clear the stack back to a table context" - while self.tree.openElements[-1].name not in ("table", "html"): - # self.parser.parseError("unexpected-implied-end-tag-in-table", - # {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - # When the current node is <html> it's an innerHTML case - - # processing methods - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-table") - else: - assert self.parser.innerHTML - # Stop parsing - - def processSpaceCharacters(self, token): - originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["inTableText"] - self.parser.phase.originalPhase = originalPhase - self.parser.phase.processSpaceCharacters(token) - - def processCharacters(self, token): - originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["inTableText"] - self.parser.phase.originalPhase = originalPhase - self.parser.phase.processCharacters(token) - - def insertText(self, token): - # If we get here there must be at least one non-whitespace character - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processCharacters(token) - self.tree.insertFromTable = False - - def startTagCaption(self, token): - self.clearStackToTableContext() - self.tree.activeFormattingElements.append(Marker) - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inCaption"] - - def startTagColgroup(self, token): - self.clearStackToTableContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inColumnGroup"] - - def startTagCol(self, token): - self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) - return token - - def startTagRowGroup(self, token): - self.clearStackToTableContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inTableBody"] - - def startTagImplyTbody(self, token): - self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) - return token - - def startTagTable(self, token): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "table", "endName": "table"}) - self.parser.phase.processEndTag(impliedTagToken("table")) - if not self.parser.innerHTML: - return token - - def startTagStyleScript(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagInput(self, token): - if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): - self.parser.parseError("unexpected-hidden-input-in-table") - self.tree.insertElement(token) - # XXX associate with form - self.tree.openElements.pop() - else: - self.startTagOther(token) - - def startTagForm(self, token): - self.parser.parseError("unexpected-form-in-table") - if self.tree.formPointer is None: - self.tree.insertElement(token) - self.tree.formPointer = self.tree.openElements[-1] - self.tree.openElements.pop() - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processStartTag(token) - self.tree.insertFromTable = False - - def endTagTable(self, token): - if self.tree.elementInScope("table", variant="table"): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "table": - self.parser.parseError("end-tag-too-early-named", - {"gotName": "table", - "expectedName": self.tree.openElements[-1].name}) - while self.tree.openElements[-1].name != "table": - self.tree.openElements.pop() - self.tree.openElements.pop() - self.parser.resetInsertionMode() - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processEndTag(token) - self.tree.insertFromTable = False - - class InTableTextPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.originalPhase = None - self.characterTokens = [] - - def flushCharacters(self): - data = "".join([item["data"] for item in self.characterTokens]) - if any([item not in spaceCharacters for item in data]): - token = {"type": tokenTypes["Characters"], "data": data} - self.parser.phases["inTable"].insertText(token) - elif data: - self.tree.insertText(data) - self.characterTokens = [] - - def processComment(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase - return token - - def processEOF(self): - self.flushCharacters() - self.parser.phase = self.originalPhase - return True - - def processCharacters(self, token): - if token["data"] == "\u0000": - return - self.characterTokens.append(token) - - def processSpaceCharacters(self, token): - # pretty sure we should never reach here - self.characterTokens.append(token) - # assert False - - def processStartTag(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase - return token - - def processEndTag(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase - return token - - class InCaptionPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-caption - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.startTagTableElement) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("caption", self.endTagCaption), - ("table", self.endTagTable), - (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther - - def ignoreEndTagCaption(self): - return not self.tree.elementInScope("caption", variant="table") - - def processEOF(self): - self.parser.phases["inBody"].processEOF() - - def processCharacters(self, token): - return self.parser.phases["inBody"].processCharacters(token) - - def startTagTableElement(self, token): - self.parser.parseError() - # XXX Have to duplicate logic here to find out if the tag is ignored - ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag(impliedTagToken("caption")) - if not ignoreEndTag: - return token - - def startTagOther(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def endTagCaption(self, token): - if not self.ignoreEndTagCaption(): - # AT this code is quite similar to endTagTable in "InTable" - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "caption": - self.parser.parseError("expected-one-end-tag-but-got-another", - {"gotName": "caption", - "expectedName": self.tree.openElements[-1].name}) - while self.tree.openElements[-1].name != "caption": - self.tree.openElements.pop() - self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - self.parser.phase = self.parser.phases["inTable"] - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagTable(self, token): - self.parser.parseError() - ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag(impliedTagToken("caption")) - if not ignoreEndTag: - return token - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def endTagOther(self, token): - return self.parser.phases["inBody"].processEndTag(token) - - class InColumnGroupPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-column - - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("col", self.startTagCol) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("colgroup", self.endTagColgroup), - ("col", self.endTagCol) - ]) - self.endTagHandler.default = self.endTagOther - - def ignoreEndTagColgroup(self): - return self.tree.openElements[-1].name == "html" - - def processEOF(self): - if self.tree.openElements[-1].name == "html": - assert self.parser.innerHTML - return - else: - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return True - - def processCharacters(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token - - def startTagCol(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagOther(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token - - def endTagColgroup(self, token): - if self.ignoreEndTagColgroup(): - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - else: - self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTable"] - - def endTagCol(self, token): - self.parser.parseError("no-end-tag", {"name": "col"}) - - def endTagOther(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token - - class InTableBodyPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("tr", self.startTagTr), - (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), - self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), - ("table", self.endTagTable), - (("body", "caption", "col", "colgroup", "html", "td", "th", - "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther - - # helper methods - def clearStackToTableBodyContext(self): - while self.tree.openElements[-1].name not in ("tbody", "tfoot", - "thead", "html"): - # self.parser.parseError("unexpected-implied-end-tag-in-table", - # {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - if self.tree.openElements[-1].name == "html": - assert self.parser.innerHTML - - # the rest - def processEOF(self): - self.parser.phases["inTable"].processEOF() - - def processSpaceCharacters(self, token): - return self.parser.phases["inTable"].processSpaceCharacters(token) - - def processCharacters(self, token): - return self.parser.phases["inTable"].processCharacters(token) - - def startTagTr(self, token): - self.clearStackToTableBodyContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inRow"] - - def startTagTableCell(self, token): - self.parser.parseError("unexpected-cell-in-table-body", - {"name": token["name"]}) - self.startTagTr(impliedTagToken("tr", "StartTag")) - return token - - def startTagTableOther(self, token): - # XXX AT Any ideas on how to share this with endTagTable? - if (self.tree.elementInScope("tbody", variant="table") or - self.tree.elementInScope("thead", variant="table") or - self.tree.elementInScope("tfoot", variant="table")): - self.clearStackToTableBodyContext() - self.endTagTableRowGroup( - impliedTagToken(self.tree.openElements[-1].name)) - return token - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def startTagOther(self, token): - return self.parser.phases["inTable"].processStartTag(token) - - def endTagTableRowGroup(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.clearStackToTableBodyContext() - self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTable"] - else: - self.parser.parseError("unexpected-end-tag-in-table-body", - {"name": token["name"]}) - - def endTagTable(self, token): - if (self.tree.elementInScope("tbody", variant="table") or - self.tree.elementInScope("thead", variant="table") or - self.tree.elementInScope("tfoot", variant="table")): - self.clearStackToTableBodyContext() - self.endTagTableRowGroup( - impliedTagToken(self.tree.openElements[-1].name)) - return token - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag-in-table-body", - {"name": token["name"]}) - - def endTagOther(self, token): - return self.parser.phases["inTable"].processEndTag(token) - - class InRowPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-row - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead", - "tr"), self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("tr", self.endTagTr), - ("table", self.endTagTable), - (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), - (("body", "caption", "col", "colgroup", "html", "td", "th"), - self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther - - # helper methods (XXX unify this with other table helper methods) - def clearStackToTableRowContext(self): - while self.tree.openElements[-1].name not in ("tr", "html"): - self.parser.parseError("unexpected-implied-end-tag-in-table-row", - {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - - def ignoreEndTagTr(self): - return not self.tree.elementInScope("tr", variant="table") - - # the rest - def processEOF(self): - self.parser.phases["inTable"].processEOF() - - def processSpaceCharacters(self, token): - return self.parser.phases["inTable"].processSpaceCharacters(token) - - def processCharacters(self, token): - return self.parser.phases["inTable"].processCharacters(token) - - def startTagTableCell(self, token): - self.clearStackToTableRowContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inCell"] - self.tree.activeFormattingElements.append(Marker) - - def startTagTableOther(self, token): - ignoreEndTag = self.ignoreEndTagTr() - self.endTagTr(impliedTagToken("tr")) - # XXX how are we sure it's always ignored in the innerHTML case? - if not ignoreEndTag: - return token - - def startTagOther(self, token): - return self.parser.phases["inTable"].processStartTag(token) - - def endTagTr(self, token): - if not self.ignoreEndTagTr(): - self.clearStackToTableRowContext() - self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTableBody"] - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagTable(self, token): - ignoreEndTag = self.ignoreEndTagTr() - self.endTagTr(impliedTagToken("tr")) - # Reprocess the current tag if the tr end tag was not ignored - # XXX how are we sure it's always ignored in the innerHTML case? - if not ignoreEndTag: - return token - - def endTagTableRowGroup(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.endTagTr(impliedTagToken("tr")) - return token - else: - self.parser.parseError() - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag-in-table-row", - {"name": token["name"]}) - - def endTagOther(self, token): - return self.parser.phases["inTable"].processEndTag(token) - - class InCellPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-cell - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("td", "th"), self.endTagTableCell), - (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), - (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) - ]) - self.endTagHandler.default = self.endTagOther - - # helper - def closeCell(self): - if self.tree.elementInScope("td", variant="table"): - self.endTagTableCell(impliedTagToken("td")) - elif self.tree.elementInScope("th", variant="table"): - self.endTagTableCell(impliedTagToken("th")) - - # the rest - def processEOF(self): - self.parser.phases["inBody"].processEOF() - - def processCharacters(self, token): - return self.parser.phases["inBody"].processCharacters(token) - - def startTagTableOther(self, token): - if (self.tree.elementInScope("td", variant="table") or - self.tree.elementInScope("th", variant="table")): - self.closeCell() - return token - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def startTagOther(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def endTagTableCell(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.tree.generateImpliedEndTags(token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-cell-end-tag", - {"name": token["name"]}) - while True: - node = self.tree.openElements.pop() - if node.name == token["name"]: - break - else: - self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - self.parser.phase = self.parser.phases["inRow"] - else: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def endTagImply(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.closeCell() - return token - else: - # sometimes innerHTML case - self.parser.parseError() - - def endTagOther(self, token): - return self.parser.phases["inBody"].processEndTag(token) - - class InSelectPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("option", self.startTagOption), - ("optgroup", self.startTagOptgroup), - ("select", self.startTagSelect), - (("input", "keygen", "textarea"), self.startTagInput), - ("script", self.startTagScript) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("option", self.endTagOption), - ("optgroup", self.endTagOptgroup), - ("select", self.endTagSelect) - ]) - self.endTagHandler.default = self.endTagOther - - # http://www.whatwg.org/specs/web-apps/current-work/#in-select - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-select") - else: - assert self.parser.innerHTML - - def processCharacters(self, token): - if token["data"] == "\u0000": - return - self.tree.insertText(token["data"]) - - def startTagOption(self, token): - # We need to imply </option> if <option> is the current node. - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - self.tree.insertElement(token) - - def startTagOptgroup(self, token): - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - if self.tree.openElements[-1].name == "optgroup": - self.tree.openElements.pop() - self.tree.insertElement(token) - - def startTagSelect(self, token): - self.parser.parseError("unexpected-select-in-select") - self.endTagSelect(impliedTagToken("select")) - - def startTagInput(self, token): - self.parser.parseError("unexpected-input-in-select") - if self.tree.elementInScope("select", variant="select"): - self.endTagSelect(impliedTagToken("select")) - return token - else: - assert self.parser.innerHTML - - def startTagScript(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-in-select", - {"name": token["name"]}) - - def endTagOption(self, token): - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - else: - self.parser.parseError("unexpected-end-tag-in-select", - {"name": "option"}) - - def endTagOptgroup(self, token): - # </optgroup> implicitly closes <option> - if (self.tree.openElements[-1].name == "option" and - self.tree.openElements[-2].name == "optgroup"): - self.tree.openElements.pop() - # It also closes </optgroup> - if self.tree.openElements[-1].name == "optgroup": - self.tree.openElements.pop() - # But nothing else - else: - self.parser.parseError("unexpected-end-tag-in-select", - {"name": "optgroup"}) - - def endTagSelect(self, token): - if self.tree.elementInScope("select", variant="select"): - node = self.tree.openElements.pop() - while node.name != "select": - node = self.tree.openElements.pop() - self.parser.resetInsertionMode() - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-in-select", - {"name": token["name"]}) - - class InSelectInTablePhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - self.startTagTable) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - self.endTagTable) - ]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - self.parser.phases["inSelect"].processEOF() - - def processCharacters(self, token): - return self.parser.phases["inSelect"].processCharacters(token) - - def startTagTable(self, token): - self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) - self.endTagOther(impliedTagToken("select")) - return token - - def startTagOther(self, token): - return self.parser.phases["inSelect"].processStartTag(token) - - def endTagTable(self, token): - self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) - if self.tree.elementInScope(token["name"], variant="table"): - self.endTagOther(impliedTagToken("select")) - return token - - def endTagOther(self, token): - return self.parser.phases["inSelect"].processEndTag(token) - - class InForeignContentPhase(Phase): - breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", - "center", "code", "dd", "div", "dl", "dt", - "em", "embed", "h1", "h2", "h3", - "h4", "h5", "h6", "head", "hr", "i", "img", - "li", "listing", "menu", "meta", "nobr", - "ol", "p", "pre", "ruby", "s", "small", - "span", "strong", "strike", "sub", "sup", - "table", "tt", "u", "ul", "var"]) - - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - def adjustSVGTagNames(self, token): - replacements = {"altglyph": "altGlyph", - "altglyphdef": "altGlyphDef", - "altglyphitem": "altGlyphItem", - "animatecolor": "animateColor", - "animatemotion": "animateMotion", - "animatetransform": "animateTransform", - "clippath": "clipPath", - "feblend": "feBlend", - "fecolormatrix": "feColorMatrix", - "fecomponenttransfer": "feComponentTransfer", - "fecomposite": "feComposite", - "feconvolvematrix": "feConvolveMatrix", - "fediffuselighting": "feDiffuseLighting", - "fedisplacementmap": "feDisplacementMap", - "fedistantlight": "feDistantLight", - "feflood": "feFlood", - "fefunca": "feFuncA", - "fefuncb": "feFuncB", - "fefuncg": "feFuncG", - "fefuncr": "feFuncR", - "fegaussianblur": "feGaussianBlur", - "feimage": "feImage", - "femerge": "feMerge", - "femergenode": "feMergeNode", - "femorphology": "feMorphology", - "feoffset": "feOffset", - "fepointlight": "fePointLight", - "fespecularlighting": "feSpecularLighting", - "fespotlight": "feSpotLight", - "fetile": "feTile", - "feturbulence": "feTurbulence", - "foreignobject": "foreignObject", - "glyphref": "glyphRef", - "lineargradient": "linearGradient", - "radialgradient": "radialGradient", - "textpath": "textPath"} - - if token["name"] in replacements: - token["name"] = replacements[token["name"]] - - def processCharacters(self, token): - if token["data"] == "\u0000": - token["data"] = "\uFFFD" - elif (self.parser.framesetOK and - any(char not in spaceCharacters for char in token["data"])): - self.parser.framesetOK = False - Phase.processCharacters(self, token) - - def processStartTag(self, token): - currentNode = self.tree.openElements[-1] - if (token["name"] in self.breakoutElements or - (token["name"] == "font" and - set(token["data"].keys()) & set(["color", "face", "size"]))): - self.parser.parseError("unexpected-html-element-in-foreign-content", - {"name": token["name"]}) - while (self.tree.openElements[-1].namespace != - self.tree.defaultNamespace and - not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and - not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): - self.tree.openElements.pop() - return token - - else: - if currentNode.namespace == namespaces["mathml"]: - self.parser.adjustMathMLAttributes(token) - elif currentNode.namespace == namespaces["svg"]: - self.adjustSVGTagNames(token) - self.parser.adjustSVGAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = currentNode.namespace - self.tree.insertElement(token) - if token["selfClosing"]: - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def processEndTag(self, token): - nodeIndex = len(self.tree.openElements) - 1 - node = self.tree.openElements[-1] - if node.name.translate(asciiUpper2Lower) != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - while True: - if node.name.translate(asciiUpper2Lower) == token["name"]: - # XXX this isn't in the spec but it seems necessary - if self.parser.phase == self.parser.phases["inTableText"]: - self.parser.phase.flushCharacters() - self.parser.phase = self.parser.phase.originalPhase - while self.tree.openElements.pop() != node: - assert self.tree.openElements - new_token = None - break - nodeIndex -= 1 - - node = self.tree.openElements[nodeIndex] - if node.namespace != self.tree.defaultNamespace: - continue - else: - new_token = self.parser.phase.processEndTag(token) - break - return new_token - - class AfterBodyPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - # Stop parsing - pass - - def processComment(self, token): - # This is needed because data is to be appended to the <html> element - # here and not to whatever is currently open. - self.tree.insertComment(token, self.tree.openElements[0]) - - def processCharacters(self, token): - self.parser.parseError("unexpected-char-after-body") - self.parser.phase = self.parser.phases["inBody"] - return token - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-after-body", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token - - def endTagHtml(self, name): - if self.parser.innerHTML: - self.parser.parseError("unexpected-end-tag-after-body-innerhtml") - else: - self.parser.phase = self.parser.phases["afterAfterBody"] - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-after-body", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token - - class InFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("frameset", self.startTagFrameset), - ("frame", self.startTagFrame), - ("noframes", self.startTagNoframes) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("frameset", self.endTagFrameset) - ]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-frameset") - else: - assert self.parser.innerHTML - - def processCharacters(self, token): - self.parser.parseError("unexpected-char-in-frameset") - - def startTagFrameset(self, token): - self.tree.insertElement(token) - - def startTagFrame(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - - def startTagNoframes(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-in-frameset", - {"name": token["name"]}) - - def endTagFrameset(self, token): - if self.tree.openElements[-1].name == "html": - # innerHTML case - self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") - else: - self.tree.openElements.pop() - if (not self.parser.innerHTML and - self.tree.openElements[-1].name != "frameset"): - # If we're not in innerHTML mode and the current node is not a - # "frameset" element (anymore) then switch. - self.parser.phase = self.parser.phases["afterFrameset"] - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-in-frameset", - {"name": token["name"]}) - - class AfterFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#after3 - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("noframes", self.startTagNoframes) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("html", self.endTagHtml) - ]) - self.endTagHandler.default = self.endTagOther - - def processEOF(self): - # Stop parsing - pass - - def processCharacters(self, token): - self.parser.parseError("unexpected-char-after-frameset") - - def startTagNoframes(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-after-frameset", - {"name": token["name"]}) - - def endTagHtml(self, token): - self.parser.phase = self.parser.phases["afterAfterFrameset"] - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-after-frameset", - {"name": token["name"]}) - - class AfterAfterBodyPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml) - ]) - self.startTagHandler.default = self.startTagOther - - def processEOF(self): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processSpaceCharacters(self, token): - return self.parser.phases["inBody"].processSpaceCharacters(token) - - def processCharacters(self, token): - self.parser.parseError("expected-eof-but-got-char") - self.parser.phase = self.parser.phases["inBody"] - return token - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("expected-eof-but-got-start-tag", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token - - def processEndTag(self, token): - self.parser.parseError("expected-eof-but-got-end-tag", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token - - class AfterAfterFramesetPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("noframes", self.startTagNoFrames) - ]) - self.startTagHandler.default = self.startTagOther - - def processEOF(self): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processSpaceCharacters(self, token): - return self.parser.phases["inBody"].processSpaceCharacters(token) - - def processCharacters(self, token): - self.parser.parseError("expected-eof-but-got-char") - - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) - - def startTagNoFrames(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagOther(self, token): - self.parser.parseError("expected-eof-but-got-start-tag", - {"name": token["name"]}) - - def processEndTag(self, token): - self.parser.parseError("expected-eof-but-got-end-tag", - {"name": token["name"]}) - # pylint:enable=unused-argument - - return { - "initial": InitialPhase, - "beforeHtml": BeforeHtmlPhase, - "beforeHead": BeforeHeadPhase, - "inHead": InHeadPhase, - "inHeadNoscript": InHeadNoscriptPhase, - "afterHead": AfterHeadPhase, - "inBody": InBodyPhase, - "text": TextPhase, - "inTable": InTablePhase, - "inTableText": InTableTextPhase, - "inCaption": InCaptionPhase, - "inColumnGroup": InColumnGroupPhase, - "inTableBody": InTableBodyPhase, - "inRow": InRowPhase, - "inCell": InCellPhase, - "inSelect": InSelectPhase, - "inSelectInTable": InSelectInTablePhase, - "inForeignContent": InForeignContentPhase, - "afterBody": AfterBodyPhase, - "inFrameset": InFramesetPhase, - "afterFrameset": AfterFramesetPhase, - "afterAfterBody": AfterAfterBodyPhase, - "afterAfterFrameset": AfterAfterFramesetPhase, - # XXX after after frameset - } - - -def adjust_attributes(token, replacements): - needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) - if needs_adjustment: - token['data'] = OrderedDict((replacements.get(k, k), v) - for k, v in token['data'].items()) - - -def impliedTagToken(name, type="EndTag", attributes=None, - selfClosing=False): - if attributes is None: - attributes = {} - return {"type": tokenTypes[type], "name": name, "data": attributes, - "selfClosing": selfClosing} - - -class ParseError(Exception): - """Error in parsed document""" - pass |