diff options
author | Renard | 2020-03-29 18:43:36 -0300 |
---|---|---|
committer | Renard | 2020-03-29 18:43:36 -0300 |
commit | 56c690b9efdb009ab44f3112b6c301d7d393f07e (patch) | |
tree | b2a28666888df9b60b46b6d1c59dd3818437b405 /cgi/BeautifulSoup.py | |
parent | 775ef3e6291c5ad6bff68a12f6ca81c8663da3dc (diff) | |
download | weabot-56c690b9efdb009ab44f3112b6c301d7d393f07e.tar.gz weabot-56c690b9efdb009ab44f3112b6c301d7d393f07e.tar.xz weabot-56c690b9efdb009ab44f3112b6c301d7d393f07e.zip |
Formateo de python con pep8
Diffstat (limited to 'cgi/BeautifulSoup.py')
-rw-r--r-- | cgi/BeautifulSoup.py | 480 |
1 files changed, 255 insertions, 225 deletions
diff --git a/cgi/BeautifulSoup.py b/cgi/BeautifulSoup.py index 7278215..3e97785 100644 --- a/cgi/BeautifulSoup.py +++ b/cgi/BeautifulSoup.py @@ -90,26 +90,28 @@ import types import re import sgmllib try: - from htmlentitydefs import name2codepoint + from htmlentitydefs import name2codepoint except ImportError: - name2codepoint = {} + name2codepoint = {} try: set except NameError: from sets import Set as set -#These hacks make Beautiful Soup able to parse XML with namespaces +# These hacks make Beautiful Soup able to parse XML with namespaces sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" + def _match_css_class(str): """Build a RE to match the given CSS class.""" return re.compile(r"(^|.*\s)%s($|\s)" % str) # First, the classes that represent markup elements. + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -117,15 +119,15 @@ class PageElement(object): def _invert(h): "Cheap function to invert a hash." i = {} - for k,v in h.items(): + for k, v in h.items(): i[v] = k return i - XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } + XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'", + "quot": '"', + "amp": "&", + "lt": "<", + "gt": ">"} XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) @@ -145,7 +147,7 @@ class PageElement(object): oldParent = self.parent myIndex = self.parent.index(self) if hasattr(replaceWith, "parent")\ - and replaceWith.parent is self.parent: + and replaceWith.parent is self.parent: # We're replacing this element with one of its siblings. index = replaceWith.parent.index(replaceWith) if index and index < myIndex: @@ -173,9 +175,9 @@ class PageElement(object): except ValueError: pass - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. + # Find the two elements that would be next to each other if + # this element (and any children) hadn't been parsed. Connect + # the two. lastChild = self._lastRecursiveChild() nextElement = lastChild.next @@ -203,10 +205,10 @@ class PageElement(object): def insert(self, position, newChild): if isinstance(newChild, basestring) \ - and not isinstance(newChild, NavigableString): + and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) - position = min(position, len(self.contents)) + position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent is not None: # We're 'inserting' an element that's already one # of this object's children. @@ -243,7 +245,7 @@ class PageElement(object): while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent - if not parent: # This is the last element in the document. + if not parent: # This is the last element in the document. break if parentsNextSibling: newChildsLastElement.next = parentsNextSibling @@ -288,7 +290,7 @@ class PageElement(object): criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and @@ -300,8 +302,8 @@ class PageElement(object): """Returns all items that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the @@ -315,7 +317,7 @@ class PageElement(object): criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x def findParent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given @@ -334,9 +336,9 @@ class PageElement(object): return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) - fetchParents = findParents # Compatibility with pre-3.x + fetchParents = findParents # Compatibility with pre-3.x - #These methods do the real heavy lifting. + # These methods do the real heavy lifting. def _findOne(self, method, name, attrs, text, **kwargs): r = None @@ -381,8 +383,8 @@ class PageElement(object): break return results - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. + # These Generators can be used to navigate starting from both + # NavigableStrings and Tags. def nextGenerator(self): i = self while i is not None: @@ -431,7 +433,7 @@ class PageElement(object): s = unicode(s) else: if encoding: - s = self.toEncoding(str(s), encoding) + s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s @@ -483,11 +485,13 @@ class NavigableString(unicode, PageElement): else: return data + class CData(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) + class ProcessingInstruction(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): output = self @@ -495,14 +499,17 @@ class ProcessingInstruction(NavigableString): output = self.substituteEncoding(output, encoding) return "<?%s?>" % self.toEncoding(output, encoding) + class Comment(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!--%s-->" % NavigableString.__str__(self, encoding) + class Declaration(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!%s>" % NavigableString.__str__(self, encoding) + class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" @@ -555,15 +562,15 @@ class Tag(PageElement): self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # Convert any HTML, XML, or numeric entities in the attribute values. - convert = lambda(k, val): (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) + def convert((k, val)): return (k, + re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, + val)) self.attrs = map(convert, self.attrs) def getString(self): if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): + and isinstance(self.contents[0], NavigableString)): return self.contents[0] def setString(self, string): @@ -646,8 +653,8 @@ class Tag(PageElement): for item in self.attrs: if item[0] == key: self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. + # We don't break because bad HTML can define the same + # attribute multiple times. self._getAttrMap() if self.attrMap.has_key(key): del self.attrMap[key] @@ -659,7 +666,7 @@ class Tag(PageElement): return apply(self.findAll, args, kwargs) def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) + # print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: return self.find(tag[:-3]) elif tag.find('__') != 0: @@ -738,7 +745,8 @@ class Tag(PageElement): # value might also contain angle brackets, or # ampersands that aren't part of entities. We need # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + val = self.BARE_AMPERSAND_OR_BRACKET.sub( + self._sub_entity, val) attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) @@ -802,7 +810,7 @@ class Tag(PageElement): prettyPrint=False, indentLevel=0): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" - s=[] + s = [] for c in self: text = None if isinstance(c, NavigableString): @@ -819,7 +827,7 @@ class Tag(PageElement): s.append("\n") return ''.join(s) - #Soup methods + # Soup methods def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): @@ -859,7 +867,7 @@ class Tag(PageElement): def firstText(self, text=None, recursive=True): return self.find(text=text, recursive=recursive) - #Private methods + # Private methods def _getAttrMap(self): """Initializes a map representation of this tag's attributes, @@ -870,7 +878,7 @@ class Tag(PageElement): self.attrMap[key] = value return self.attrMap - #Generator methods + # Generator methods def childGenerator(self): # Just use the iterator from the contents return iter(self.contents) @@ -917,12 +925,12 @@ class SoupStrainer: markup = markupName markupAttrs = markup callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + and not isinstance(markupName, Tag) if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: @@ -930,11 +938,11 @@ class SoupStrainer: markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: - if hasattr(markupAttrs, 'get'): + if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs - else: + else: markupAttrMap = {} - for k,v in markupAttrs: + for k, v in markupAttrs: markupAttrMap[k] = v attrValue = markupAttrMap.get(attr) if not self._matches(attrValue, matchAgainst): @@ -948,7 +956,7 @@ class SoupStrainer: return found def search(self, markup): - #print 'looking for %s in %s' % (self, markup) + # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. @@ -956,7 +964,7 @@ class SoupStrainer: and not isinstance(markup, Tag): for element in markup: if isinstance(element, NavigableString) \ - and self.search(element): + and self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. @@ -966,33 +974,33 @@ class SoupStrainer: found = self.searchTag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, basestring): if self._matches(markup, self.text): found = markup else: raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + % markup.__class__ return found def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) + # print "Matching %s against %s" % (markup, matchAgainst) result = False if matchAgainst is True: result = markup is not None elif callable(matchAgainst): result = matchAgainst(markup) else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. + # Custom match methods take the tag as an argument, but all + # other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name if markup and not isinstance(markup, basestring): markup = unicode(markup) - #Now we know that chunk is either a string, or None. + # Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like + elif hasattr(matchAgainst, '__iter__'): # list-like result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) @@ -1006,15 +1014,18 @@ class SoupStrainer: result = matchAgainst == markup return result + class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" + def __init__(self, source): list.__init__([]) self.source = source # Now, some helper functions. + def buildTagMap(default, *args): """Turns a list of maps, lists, or scalars into a single map. Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and @@ -1022,20 +1033,21 @@ def buildTagMap(default, *args): built = {} for portion in args: if hasattr(portion, 'items'): - #It's a map. Merge it. - for k,v in portion.items(): + # It's a map. Merge it. + for k, v in portion.items(): built[k] = v - elif hasattr(portion, '__iter__'): # is a list - #It's a list. Map each item to the default. + elif hasattr(portion, '__iter__'): # is a list + # It's a list. Map each item to the default. for k in portion: built[k] = default else: - #It's a scalar. Map it to the default. + # It's a scalar. Map it to the default. built[portion] = default return built # Now, the parser classes. + class BeautifulStoneSoup(Tag, SGMLParser): """This class contains the basic parser and search code. It defines @@ -1078,7 +1090,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. - STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, @@ -1155,7 +1167,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): n = int(name) except ValueError: return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 + if not 0 <= n <= 127: # ASCII ends at 127, not 255 return return self.convert_codepoint(n) @@ -1166,9 +1178,8 @@ class BeautifulStoneSoup(Tag, SGMLParser): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) + dammit = UnicodeDammit(markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding @@ -1195,10 +1206,10 @@ class BeautifulStoneSoup(Tag, SGMLParser): def __getattr__(self, methodName): """This method routes method call requests to either the SGMLParser superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) + # print "__getattr__ called on %s.%s" % (self.__class__, methodName) if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): + or methodName.startswith('do_'): return SGMLParser.__getattr__(self, methodName) elif not methodName.startswith('__'): return Tag.__getattr__(self, methodName) @@ -1209,7 +1220,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) + or self.instanceSelfClosingTags.has_key(name) def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) @@ -1224,13 +1235,13 @@ class BeautifulStoneSoup(Tag, SGMLParser): def popTag(self): tag = self.tagStack.pop() - #print "Pop", tag.name + # print "Pop", tag.name if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): - #print "Push", tag.name + # print "Push", tag.name if self.currentTag: self.currentTag.contents.append(tag) self.tagStack.append(tag) @@ -1248,7 +1259,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): currentData = ' ' self.currentData = [] if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ + (not self.parseOnlyThese.text or not self.parseOnlyThese.search(currentData)): return o = containerClass(currentData) @@ -1258,13 +1269,12 @@ class BeautifulStoneSoup(Tag, SGMLParser): self.previous = o self.currentTag.contents.append(o) - def _popToTag(self, name, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of the given tag.""" - #print "Popping to %s" % name + # print "Popping to %s" % name if name == self.ROOT_TAG_NAME: return @@ -1282,7 +1292,6 @@ class BeautifulStoneSoup(Tag, SGMLParser): return mostRecentTag def _smartPop(self, name): - """We need to pop up to the previous tag of this type, unless one of this tag's nesting reset triggers comes between this tag and the previous tag of this type, OR unless this tag is a @@ -1307,8 +1316,8 @@ class BeautifulStoneSoup(Tag, SGMLParser): for i in range(len(self.tagStack)-1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. + # Non-nestable tags get popped to the top or to their + # last occurance. popTo = name break if (nestingResetTriggers is not None @@ -1316,10 +1325,10 @@ class BeautifulStoneSoup(Tag, SGMLParser): or (nestingResetTriggers is None and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name)): - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. + # If we encounter one of the nesting reset triggers + # peculiar to this tag, or we encounter another tag + # that causes nesting to reset, pop up to but not + # including that tag. popTo = p.name inclusive = False break @@ -1328,10 +1337,10 @@ class BeautifulStoneSoup(Tag, SGMLParser): self._popToTag(popTo, inclusive) def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s: %s" % (name, attrs) + # print "Start tag %s: %s" % (name, attrs) if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name + # This is not a real tag. + # print "<%s> is not real!" % name attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) self.handle_data('<%s%s>' % (name, attrs)) return @@ -1341,7 +1350,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): self._smartPop(name) if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): return tag = Tag(self, name, attrs, self.currentTag, self.previous) @@ -1352,16 +1361,16 @@ class BeautifulStoneSoup(Tag, SGMLParser): if selfClosing or self.isSelfClosingTag(name): self.popTag() if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name + # print "Beginning quote (%s)" % name self.quoteStack.append(name) self.literal = 1 return tag def unknown_endtag(self, name): - #print "End tag %s" % name + # print "End tag %s" % name if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print "</%s> is not real!" % name + # This is not a real end tag. + # print "</%s> is not real!" % name self.handle_data('</%s>' % name) return self.endData() @@ -1412,27 +1421,27 @@ class BeautifulStoneSoup(Tag, SGMLParser): pass if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref if not data: # This case is different from the one above, because we # haven't already gone through a supposedly comprehensive @@ -1452,12 +1461,12 @@ class BeautifulStoneSoup(Tag, SGMLParser): declaration as a CData object.""" j = None if self.rawdata[i:i+9] == '<![CDATA[': - k = self.rawdata.find(']]>', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) else: try: j = SGMLParser.parse_declaration(self, i) @@ -1467,6 +1476,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): j = i + len(toHandle) return j + class BeautifulSoup(BeautifulStoneSoup): """This parser knows the following facts about HTML: @@ -1522,46 +1532,46 @@ class BeautifulSoup(BeautifulStoneSoup): BeautifulStoneSoup.__init__(self, *args, **kwargs) SELF_CLOSING_TAGS = buildTagMap(None, - ('br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base', 'col')) + ('br', 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base', 'col')) PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) - QUOTE_TAGS = {'script' : None, 'textarea' : None} + QUOTE_TAGS = {'script': None, 'textarea': None} - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # According to the HTML standard, each of these inline tags can + # contain another tag of the same type. Furthermore, it's common + # to actually use these tags this way. NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center') - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. + # According to the HTML standard, these block tags can contain + # another tag of the same type. Furthermore, it's common + # to actually use these tags this way. NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - #Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], + # Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = {'ol': [], + 'ul': [], + 'li': ['ul', 'ol'], + 'dl': [], + 'dd': ['dl'], + 'dt': ['dl']} + + # Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table': [], + 'tr': ['table', 'tbody', 'tfoot', 'thead'], + 'td': ['tr'], + 'th': ['tr'], + 'thead': ['table'], + 'tbody': ['table'], + 'tfoot': ['table'], } NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. + # If one of these tags is encountered, all tags up to the next tag of + # this type are popped. RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, @@ -1591,11 +1601,11 @@ class BeautifulSoup(BeautifulStoneSoup): contentType = value contentTypeIndex = i - if httpEquiv and contentType: # It's an interesting meta tag. + if httpEquiv and contentType: # It's an interesting meta tag. match = self.CHARSET_RE.search(contentType) if match: if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): + self.originalEncoding == self.fromEncoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the @@ -1620,9 +1630,11 @@ class BeautifulSoup(BeautifulStoneSoup): if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True + class StopParsing(Exception): pass + class ICantBelieveItsBeautifulSoup(BeautifulSoup): """The BeautifulSoup class is oriented towards skipping over @@ -1649,9 +1661,9 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup): wouldn't be.""" I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big') + ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big') I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) @@ -1659,6 +1671,7 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup): I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + class MinimalSoup(BeautifulSoup): """The MinimalSoup class is for parsing HTML that contains pathologically bad markup. It makes no assumptions about tag @@ -1672,6 +1685,7 @@ class MinimalSoup(BeautifulSoup): RESET_NESTING_TAGS = buildTagMap('noscript') NESTABLE_TAGS = {} + class BeautifulSOAP(BeautifulStoneSoup): """This class will push a tag with only a single string child into the tag's parent as an attribute. The attribute's name is the tag @@ -1699,26 +1713,36 @@ class BeautifulSOAP(BeautifulStoneSoup): parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and - not parent.attrMap.has_key(tag.name)): + not parent.attrMap.has_key(tag.name)): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) -#Enterprise class names! It has come to our attention that some people -#think the names of the Beautiful Soup parser classes are too silly -#and "unprofessional" for use in enterprise screen-scraping. We feel -#your pain! For such-minded folk, the Beautiful Soup Consortium And -#All-Night Kosher Bakery recommends renaming this file to -#"RobustParser.py" (or, in cases of extreme enterprisiness, -#"RobustParserBeanInterface.class") and using the following -#enterprise-friendly class aliases: +# Enterprise class names! It has come to our attention that some people +# think the names of the Beautiful Soup parser classes are too silly +# and "unprofessional" for use in enterprise screen-scraping. We feel +# your pain! For such-minded folk, the Beautiful Soup Consortium And +# All-Night Kosher Bakery recommends renaming this file to +# "RobustParser.py" (or, in cases of extreme enterprisiness, +# "RobustParserBeanInterface.class") and using the following +# enterprise-friendly class aliases: + + class RobustXMLParser(BeautifulStoneSoup): pass + + class RobustHTMLParser(BeautifulSoup): pass + + class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): pass + + class RobustInsanelyWackAssHTMLParser(MinimalSoup): pass + + class SimplifyingSOAPParser(BeautifulSOAP): pass @@ -1732,6 +1756,7 @@ class SimplifyingSOAPParser(BeautifulSOAP): # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi # (XML) and BeautifulSoup.start_meta (HTML). + # Autodetects character encodings. # Download from http://chardet.feedparser.org/ try: @@ -1753,6 +1778,7 @@ try: except ImportError: pass + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -1763,14 +1789,14 @@ class UnicodeDammit: # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. - CHARSET_ALIASES = { "macintosh" : "mac-roman", - "x-sjis" : "shift-jis" } + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False): self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ - self._detectEncoding(markup, isHTML) + self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): @@ -1781,11 +1807,13 @@ class UnicodeDammit: u = None for proposedEncoding in overrideEncodings: u = self._convertFrom(proposedEncoding) - if u: break + if u: + break if not u: for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) - if u: break + if u: + break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): @@ -1795,10 +1823,12 @@ class UnicodeDammit: if not u: for proposed_encoding in ("utf-8", "windows-1252"): u = self._convertFrom(proposed_encoding) - if u: break + if u: + break self.unicode = u - if not u: self.originalEncoding = None + if not u: + self.originalEncoding = None def _subMSChar(self, orig): """Changes a MS smart quote character to an XML or HTML @@ -1823,9 +1853,8 @@ class UnicodeDammit: if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): - markup = re.compile("([\x80-\x9f])").sub \ - (lambda(x): self._subMSChar(x.group(1)), - markup) + markup = re.compile("([\x80-\x9f])").sub(lambda(x): self._subMSChar(x.group(1)), + markup) try: # print "Trying to convert document to %s" % proposed @@ -1836,7 +1865,7 @@ class UnicodeDammit: # print "That didn't work!" # print e return None - #print "Correct encoding: %s" % proposed + # print "Correct encoding: %s" % proposed return self.markup def _toUnicode(self, data, encoding): @@ -1845,11 +1874,11 @@ class UnicodeDammit: # strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): + and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' data = data[2:] elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): + and (data[2:4] != '\x00\x00'): encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': @@ -1876,7 +1905,7 @@ class UnicodeDammit: sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): + and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') @@ -1885,7 +1914,7 @@ class UnicodeDammit: sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ - (xml_data[2:4] != '\x00\x00'): + (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') @@ -1931,15 +1960,15 @@ class UnicodeDammit: xml_encoding = sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding - def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ - or charset + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset def _codec(self, charset): - if not charset: return charset + if not charset: + return charset codec = None try: codecs.lookup(charset) @@ -1949,68 +1978,69 @@ class UnicodeDammit: return codec EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) + emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, + 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, + 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, + 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, + 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94, + 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63, + 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34, + 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200, + 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, + 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, + 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 250, 251, 252, 253, 254, 255) import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + c.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) - MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} + MS_CHARS = {'\x80': ('euro', '20AC'), + '\x81': ' ', + '\x82': ('sbquo', '201A'), + '\x83': ('fnof', '192'), + '\x84': ('bdquo', '201E'), + '\x85': ('hellip', '2026'), + '\x86': ('dagger', '2020'), + '\x87': ('Dagger', '2021'), + '\x88': ('circ', '2C6'), + '\x89': ('permil', '2030'), + '\x8A': ('Scaron', '160'), + '\x8B': ('lsaquo', '2039'), + '\x8C': ('OElig', '152'), + '\x8D': '?', + '\x8E': ('#x17D', '17D'), + '\x8F': '?', + '\x90': '?', + '\x91': ('lsquo', '2018'), + '\x92': ('rsquo', '2019'), + '\x93': ('ldquo', '201C'), + '\x94': ('rdquo', '201D'), + '\x95': ('bull', '2022'), + '\x96': ('ndash', '2013'), + '\x97': ('mdash', '2014'), + '\x98': ('tilde', '2DC'), + '\x99': ('trade', '2122'), + '\x9a': ('scaron', '161'), + '\x9b': ('rsaquo', '203A'), + '\x9c': ('oelig', '153'), + '\x9d': '?', + '\x9e': ('#x17E', '17E'), + '\x9f': ('Yuml', ''), } ####################################################################### -#By default, act as an HTML pretty-printer. +# By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) |