"""
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token):
# HTML5 specific normalizations to the token stream
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token
def adjustMathMLAttributes(self, token):
adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase()
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = False
newModes = {
"select": "inSelect",
"td": "inCell",
"th": "inCell",
"tr": "inRow",
"tbody": "inTableBody",
"thead": "inTableBody",
"tfoot": "inTableBody",
"caption": "inCaption",
"colgroup": "inColumnGroup",
"table": "inTable",
"head": "inBody",
"body": "inBody",
"frameset": "inFrameset",
"html": "beforeHead"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
new_phase = None
if node == self.tree.openElements[0]:
assert self.innerHTML
last = True
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if nodeName in ("select", "colgroup", "head", "html"):
assert self.innerHTML
if not last and node.namespace != self.tree.defaultNamespace:
continue
if nodeName in newModes:
new_phase = self.phases[newModes[nodeName]]
break
elif last:
new_phase = self.phases["inBody"]
break
self.phase = new_phase
def parseRCDataRawtext(self, token, contentType):
# Generic RCDATA/RAWTEXT Parsing algorithm
assert contentType in ("RAWTEXT", "RCDATA")
self.tree.insertElement(token)
if contentType == "RAWTEXT":
self.tokenizer.state = self.tokenizer.rawtextState
else:
self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
tokenTypes.items())
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
try:
info = {"type": type_names[token['type']]}
except:
raise
if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
self.parser.phase.__class__.__name__,
self.__class__.__name__,
function.__name__,
info))
return function(self, *args, **kwargs)
else:
return function(self, *args, **kwargs)
return wrapped
def getMetaclass(use_metaclass, metaclass_func):
if use_metaclass:
return method_decorator_metaclass(metaclass_func)
else:
return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
def processEOF(self):
raise NotImplementedError
def processComment(self, token):
# For most phases the following is correct. Where it's not it will be
# overridden.
self.tree.insertComment(token, self.tree.openElements[-1])
def processDoctype(self, token):
self.parser.parseError("unexpected-doctype")
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processSpaceCharacters(self, token):
self.tree.insertText(token["data"])
def processStartTag(self, token):
return self.startTagHandler[token["name"]](token)
def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html":
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token["data"].items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, token):
return self.endTagHandler[token["name"]](token)
class InitialPhase(Phase):
def processSpaceCharacters(self, token):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId is not None or
systemId is not None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
self.tree.insertDoctype(token)
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html" or
publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) or
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html") or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None or
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//")) or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def anythingElse(self):
self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
self.anythingElse()
return token
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEOF(self):
self.parser.parseError("expected-doctype-but-got-eof")
self.anythingElse()
return True
class BeforeHtmlPhase(Phase):
# helper methods
def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
self.parser.phase = self.parser.phases["beforeHead"]
# other
def processEOF(self):
self.insertHtmlElement()
return True
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.insertHtmlElement()
return token
def processStartTag(self, token):
if token["name"] == "html":
self.parser.firstStartTag = True
self.insertHtmlElement()
return token
def processEndTag(self, token):
if token["name"] not in ("head", "body", "html", "br"):
self.parser.parseError("unexpected-end-tag-before-html",
{"name": token["name"]})
else:
self.insertHtmlElement()
return token
class BeforeHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag"))
return True
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
def startTagOther(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagImplyHead(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagOther(self, token):
self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]})
class InHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("title", self.startTagTitle),
(("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
self.endTagHandler.default = self.endTagOther
# the real thing
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMeta(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
attributes = token["data"]
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif ("content" in attributes and
"http-equiv" in attributes and
attributes["http-equiv"].lower() == "content-type"):
# Encoding it as UTF-8 here is a hack, as really we should pass
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works.
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = _inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
self.parser.originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHead(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "head", "Expected head got %s" % node.name
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
class InHeadNoscriptPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
self.startTagFromHead),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBody(self, token):
self.parser.framesetOK = False
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inBody"]
def startTagFrameset(self, token):
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, token):
self.parser.parseError("unexpected-start-tag-out-of-my-head",
{"name": token["name"]})
self.tree.openElements.append(self.tree.headPointer)
self.parser.phases["inHead"].processStartTag(token)
for node in self.tree.openElements[::-1]:
if node.name == "head":
self.tree.openElements.remove(node)
break
def startTagHead(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.tree.insertElement(impliedTagToken("body", "StartTag"))
self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
(headingElements, self.startTagHeading),
(("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext", self.startTagPlaintext),
("a", self.startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
self.startTagVoidFormatting),
(("param", "source", "track"), self.startTagParamSource),
("input", self.startTagInput),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
(("math"), self.startTagMath),
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p", self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
(headingElements, self.endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting),
(("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2):
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
# helper
def addFormattingElement(self, token):
self.tree.insertElement(token)
element = self.tree.openElements[-1]
matchingElements = []
for node in self.tree.activeFormattingElements[::-1]:
if node is Marker:
break
elif self.isMatchingFormattingElement(node, element):
matchingElements.append(node)
assert len(matchingElements) <= 3
if len(matchingElements) == 3:
self.tree.activeFormattingElements.remove(matchingElements[-1])
self.tree.activeFormattingElements.append(element)
# the real deal
def processEOF(self):
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
"tfoot", "th", "thead", "tr", "body",
"html"))
for node in self.tree.openElements[::-1]:
if node.name not in allowed_elements:
self.parser.parseError("expected-closing-tag-but-got-eof")
break
# Stop parsing
def processSpaceCharactersDropNewline(self, token):
# Sometimes (start of , , and