Diffstat (limited to 'cgi/BeautifulSoup.py')
-rw-r--r--    cgi/BeautifulSoup.py    480
1 files changed, 255 insertions, 225 deletions
diff --git a/cgi/BeautifulSoup.py b/cgi/BeautifulSoup.py
index 7278215..3e97785 100644
--- a/cgi/BeautifulSoup.py
+++ b/cgi/BeautifulSoup.py
@@ -90,26 +90,28 @@ import types
import re
import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
try:
set
except NameError:
from sets import Set as set
-#These hacks make Beautiful Soup able to parse XML with namespaces
+# These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
def _match_css_class(str):
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
# First, the classes that represent markup elements.
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -117,15 +119,15 @@ class PageElement(object):
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k, v in h.items():
i[v] = k
return i
- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
- "quot" : '"',
- "amp" : "&",
- "lt" : "<",
- "gt" : ">" }
+ XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
+ "quot": '"',
+ "amp": "&",
+ "lt": "<",
+ "gt": ">"}
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
@@ -145,7 +147,7 @@ class PageElement(object):
oldParent = self.parent
myIndex = self.parent.index(self)
if hasattr(replaceWith, "parent")\
- and replaceWith.parent is self.parent:
+ and replaceWith.parent is self.parent:
# We're replacing this element with one of its siblings.
index = replaceWith.parent.index(replaceWith)
if index and index < myIndex:
@@ -173,9 +175,9 @@ class PageElement(object):
except ValueError:
pass
- #Find the two elements that would be next to each other if
- #this element (and any children) hadn't been parsed. Connect
- #the two.
+ # Find the two elements that would be next to each other if
+ # this element (and any children) hadn't been parsed. Connect
+ # the two.
lastChild = self._lastRecursiveChild()
nextElement = lastChild.next
@@ -203,10 +205,10 @@ class PageElement(object):
def insert(self, position, newChild):
if isinstance(newChild, basestring) \
- and not isinstance(newChild, NavigableString):
+ and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
- position = min(position, len(self.contents))
+ position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
@@ -243,7 +245,7 @@ class PageElement(object):
while not parentsNextSibling:
parentsNextSibling = parent.nextSibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
if parentsNextSibling:
newChildsLastElement.next = parentsNextSibling
@@ -288,7 +290,7 @@ class PageElement(object):
criteria and appear after this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.nextSiblingGenerator, **kwargs)
- fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
@@ -300,8 +302,8 @@ class PageElement(object):
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
return self._findAll(name, attrs, text, limit, self.previousGenerator,
- **kwargs)
- fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
@@ -315,7 +317,7 @@ class PageElement(object):
criteria and appear before this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -334,9 +336,9 @@ class PageElement(object):
return self._findAll(name, attrs, None, limit, self.parentGenerator,
**kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
+ fetchParents = findParents # Compatibility with pre-3.x
- #These methods do the real heavy lifting.
+ # These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs):
r = None
@@ -381,8 +383,8 @@ class PageElement(object):
break
return results
- #These Generators can be used to navigate starting from both
- #NavigableStrings and Tags.
+ # These Generators can be used to navigate starting from both
+ # NavigableStrings and Tags.
def nextGenerator(self):
i = self
while i is not None:
@@ -431,7 +433,7 @@ class PageElement(object):
s = unicode(s)
else:
if encoding:
- s = self.toEncoding(str(s), encoding)
+ s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
return s
@@ -483,11 +485,13 @@ class NavigableString(unicode, PageElement):
else:
return data
+
class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
class ProcessingInstruction(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
output = self
@@ -495,14 +499,17 @@ class ProcessingInstruction(NavigableString):
output = self.substituteEncoding(output, encoding)
return "<?%s?>" % self.toEncoding(output, encoding)
+
class Comment(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
class Declaration(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!%s>" % NavigableString.__str__(self, encoding)
+
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
@@ -555,15 +562,15 @@ class Tag(PageElement):
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
# Convert any HTML, XML, or numeric entities in the attribute values.
- convert = lambda(k, val): (k,
- re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
- self._convertEntities,
- val))
+ def convert((k, val)): return (k,
+ re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities,
+ val))
self.attrs = map(convert, self.attrs)
def getString(self):
if (len(self.contents) == 1
- and isinstance(self.contents[0], NavigableString)):
+ and isinstance(self.contents[0], NavigableString)):
return self.contents[0]
def setString(self, string):
@@ -646,8 +653,8 @@ class Tag(PageElement):
for item in self.attrs:
if item[0] == key:
self.attrs.remove(item)
- #We don't break because bad HTML can define the same
- #attribute multiple times.
+ # We don't break because bad HTML can define the same
+ # attribute multiple times.
self._getAttrMap()
if self.attrMap.has_key(key):
del self.attrMap[key]
@@ -659,7 +666,7 @@ class Tag(PageElement):
return apply(self.findAll, args, kwargs)
def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
+ # print "Getattr %s.%s" % (self.__class__, tag)
if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
return self.find(tag[:-3])
elif tag.find('__') != 0:
@@ -738,7 +745,8 @@ class Tag(PageElement):
# value might also contain angle brackets, or
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
- val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(
+ self._sub_entity, val)
attrs.append(fmt % (self.toEncoding(key, encoding),
self.toEncoding(val, encoding)))
@@ -802,7 +810,7 @@ class Tag(PageElement):
prettyPrint=False, indentLevel=0):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -819,7 +827,7 @@ class Tag(PageElement):
s.append("\n")
return ''.join(s)
- #Soup methods
+ # Soup methods
def find(self, name=None, attrs={}, recursive=True, text=None,
**kwargs):
@@ -859,7 +867,7 @@ class Tag(PageElement):
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
- #Private methods
+ # Private methods
def _getAttrMap(self):
"""Initializes a map representation of this tag's attributes,
@@ -870,7 +878,7 @@ class Tag(PageElement):
self.attrMap[key] = value
return self.attrMap
- #Generator methods
+ # Generator methods
def childGenerator(self):
# Just use the iterator from the contents
return iter(self.contents)
@@ -917,12 +925,12 @@ class SoupStrainer:
markup = markupName
markupAttrs = markup
callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ and not isinstance(markupName, Tag)
if (not self.name) \
- or callFunctionWithTagData \
- or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
if callFunctionWithTagData:
match = self.name(markupName, markupAttrs)
else:
@@ -930,11 +938,11 @@ class SoupStrainer:
markupAttrMap = None
for attr, matchAgainst in self.attrs.items():
if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
+ if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
- else:
+ else:
markupAttrMap = {}
- for k,v in markupAttrs:
+ for k, v in markupAttrs:
markupAttrMap[k] = v
attrValue = markupAttrMap.get(attr)
if not self._matches(attrValue, matchAgainst):
@@ -948,7 +956,7 @@ class SoupStrainer:
return found
def search(self, markup):
- #print 'looking for %s in %s' % (self, markup)
+ # print 'looking for %s in %s' % (self, markup)
found = None
# If given a list of items, scan it for a text element that
# matches.
@@ -956,7 +964,7 @@ class SoupStrainer:
and not isinstance(markup, Tag):
for element in markup:
if isinstance(element, NavigableString) \
- and self.search(element):
+ and self.search(element):
found = element
break
# If it's a Tag, make sure its name or attributes match.
@@ -966,33 +974,33 @@ class SoupStrainer:
found = self.searchTag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ isinstance(markup, basestring):
if self._matches(markup, self.text):
found = markup
else:
raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ % markup.__class__
return found
def _matches(self, markup, matchAgainst):
- #print "Matching %s against %s" % (markup, matchAgainst)
+ # print "Matching %s against %s" % (markup, matchAgainst)
result = False
if matchAgainst is True:
result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else:
- #Custom match methods take the tag as an argument, but all
- #other ways of matching match the tag name as a string.
+ # Custom match methods take the tag as an argument, but all
+ # other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
if markup and not isinstance(markup, basestring):
markup = unicode(markup)
- #Now we know that chunk is either a string, or None.
+ # Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif hasattr(matchAgainst, '__iter__'): # list-like
+ elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
@@ -1006,15 +1014,18 @@ class SoupStrainer:
result = matchAgainst == markup
return result
+
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
+
def __init__(self, source):
list.__init__([])
self.source = source
# Now, some helper functions.
+
def buildTagMap(default, *args):
"""Turns a list of maps, lists, or scalars into a single map.
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
@@ -1022,20 +1033,21 @@ def buildTagMap(default, *args):
built = {}
for portion in args:
if hasattr(portion, 'items'):
- #It's a map. Merge it.
- for k,v in portion.items():
+ # It's a map. Merge it.
+ for k, v in portion.items():
built[k] = v
- elif hasattr(portion, '__iter__'): # is a list
- #It's a list. Map each item to the default.
+ elif hasattr(portion, '__iter__'): # is a list
+ # It's a list. Map each item to the default.
for k in portion:
built[k] = default
else:
- #It's a scalar. Map it to the default.
+ # It's a scalar. Map it to the default.
built[portion] = default
return built
# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):
"""This class contains the basic parser and search code. It defines
@@ -1078,7 +1090,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
@@ -1155,7 +1167,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
n = int(name)
except ValueError:
return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ if not 0 <= n <= 127: # ASCII ends at 127, not 255
return
return self.convert_codepoint(n)
@@ -1166,9 +1178,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
- dammit = UnicodeDammit\
- (markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ dammit = UnicodeDammit(markup, [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
@@ -1195,10 +1206,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
def __getattr__(self, methodName):
"""This method routes method call requests to either the SGMLParser
superclass or the Tag superclass, depending on the method name."""
- #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+ # print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
- or methodName.startswith('do_'):
+ or methodName.startswith('do_'):
return SGMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
return Tag.__getattr__(self, methodName)
@@ -1209,7 +1220,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ or self.instanceSelfClosingTags.has_key(name)
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
@@ -1224,13 +1235,13 @@ class BeautifulStoneSoup(Tag, SGMLParser):
def popTag(self):
tag = self.tagStack.pop()
- #print "Pop", tag.name
+ # print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
- #print "Push", tag.name
+ # print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
@@ -1248,7 +1259,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
currentData = ' '
self.currentData = []
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
+ (not self.parseOnlyThese.text or
not self.parseOnlyThese.search(currentData)):
return
o = containerClass(currentData)
@@ -1258,13 +1269,12 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instqance of
the given tag."""
- #print "Popping to %s" % name
+ # print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
@@ -1282,7 +1292,6 @@ class BeautifulStoneSoup(Tag, SGMLParser):
return mostRecentTag
def _smartPop(self, name):
-
"""We need to pop up to the previous tag of this type, unless
one of this tag's nesting reset triggers comes between this
tag and the previous tag of this type, OR unless this tag is a
@@ -1307,8 +1316,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
for i in range(len(self.tagStack)-1, 0, -1):
p = self.tagStack[i]
if (not p or p.name == name) and not isNestable:
- #Non-nestable tags get popped to the top or to their
- #last occurance.
+ # Non-nestable tags get popped to the top or to their
+ # last occurance.
popTo = name
break
if (nestingResetTriggers is not None
@@ -1316,10 +1325,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
or (nestingResetTriggers is None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
- #If we encounter one of the nesting reset triggers
- #peculiar to this tag, or we encounter another tag
- #that causes nesting to reset, pop up to but not
- #including that tag.
+ # If we encounter one of the nesting reset triggers
+ # peculiar to this tag, or we encounter another tag
+ # that causes nesting to reset, pop up to but not
+ # including that tag.
popTo = p.name
inclusive = False
break
@@ -1328,10 +1337,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
- #print "Start tag %s: %s" % (name, attrs)
+ # print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
- #This is not a real tag.
- #print "<%s> is not real!" % name
+ # This is not a real tag.
+ # print "<%s> is not real!" % name
attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
self.handle_data('<%s%s>' % (name, attrs))
return
@@ -1341,7 +1350,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
- and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
@@ -1352,16 +1361,16 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
- #print "Beginning quote (%s)" % name
+ # print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
return tag
def unknown_endtag(self, name):
- #print "End tag %s" % name
+ # print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
- #This is not a real end tag.
- #print "</%s> is not real!" % name
+ # This is not a real end tag.
+ # print "</%s> is not real!" % name
self.handle_data('</%s>' % name)
return
self.endData()
@@ -1412,27 +1421,27 @@ class BeautifulStoneSoup(Tag, SGMLParser):
pass
if not data and self.convertXMLEntities:
- data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
- data = "&amp;%s" % ref
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&amp;%s" % ref
if not data:
# This case is different from the one above, because we
# haven't already gone through a supposedly comprehensive
@@ -1452,12 +1461,12 @@ class BeautifulStoneSoup(Tag, SGMLParser):
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
@@ -1467,6 +1476,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
j = i + len(toHandle)
return j
+
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1522,46 +1532,46 @@ class BeautifulSoup(BeautifulStoneSoup):
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
- ('br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base', 'col'))
+ ('br', 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
- QUOTE_TAGS = {'script' : None, 'textarea' : None}
+ QUOTE_TAGS = {'script': None, 'textarea': None}
- #According to the HTML standard, each of these inline tags can
- #contain another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
+ # According to the HTML standard, each of these inline tags can
+ # contain another tag of the same type. Furthermore, it's common
+ # to actually use these tags this way.
NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
'center')
- #According to the HTML standard, these block tags can contain
- #another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
+ # According to the HTML standard, these block tags can contain
+ # another tag of the same type. Furthermore, it's common
+ # to actually use these tags this way.
NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
- #Lists can contain other lists, but there are restrictions.
- NESTABLE_LIST_TAGS = { 'ol' : [],
- 'ul' : [],
- 'li' : ['ul', 'ol'],
- 'dl' : [],
- 'dd' : ['dl'],
- 'dt' : ['dl'] }
-
- #Tables can contain other tables, but there are restrictions.
- NESTABLE_TABLE_TAGS = {'table' : [],
- 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
- 'td' : ['tr'],
- 'th' : ['tr'],
- 'thead' : ['table'],
- 'tbody' : ['table'],
- 'tfoot' : ['table'],
+ # Lists can contain other lists, but there are restrictions.
+ NESTABLE_LIST_TAGS = {'ol': [],
+ 'ul': [],
+ 'li': ['ul', 'ol'],
+ 'dl': [],
+ 'dd': ['dl'],
+ 'dt': ['dl']}
+
+ # Tables can contain other tables, but there are restrictions.
+ NESTABLE_TABLE_TAGS = {'table': [],
+ 'tr': ['table', 'tbody', 'tfoot', 'thead'],
+ 'td': ['tr'],
+ 'th': ['tr'],
+ 'thead': ['table'],
+ 'tbody': ['table'],
+ 'tfoot': ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
- #If one of these tags is encountered, all tags up to the next tag of
- #this type are popped.
+ # If one of these tags is encountered, all tags up to the next tag of
+ # this type are popped.
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
NON_NESTABLE_BLOCK_TAGS,
NESTABLE_LIST_TAGS,
@@ -1591,11 +1601,11 @@ class BeautifulSoup(BeautifulStoneSoup):
contentType = value
contentTypeIndex = i
- if httpEquiv and contentType: # It's an interesting meta tag.
+ if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
if (self.declaredHTMLEncoding is not None or
- self.originalEncoding == self.fromEncoding):
+ self.originalEncoding == self.fromEncoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -1620,9 +1630,11 @@ class BeautifulSoup(BeautifulStoneSoup):
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1649,9 +1661,9 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup):
wouldn't be."""
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
- 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big')
+ ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+ 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+ 'big')
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
@@ -1659,6 +1671,7 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup):
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
"""The MinimalSoup class is for parsing HTML that contains
pathologically bad markup. It makes no assumptions about tag
@@ -1672,6 +1685,7 @@ class MinimalSoup(BeautifulSoup):
RESET_NESTING_TAGS = buildTagMap('noscript')
NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
"""This class will push a tag with only a single string child into
the tag's parent as an attribute. The attribute's name is the tag
@@ -1699,26 +1713,36 @@ class BeautifulSOAP(BeautifulStoneSoup):
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
+ not parent.attrMap.has_key(tag.name)):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
-#Enterprise class names! It has come to our attention that some people
-#think the names of the Beautiful Soup parser classes are too silly
-#and "unprofessional" for use in enterprise screen-scraping. We feel
-#your pain! For such-minded folk, the Beautiful Soup Consortium And
-#All-Night Kosher Bakery recommends renaming this file to
-#"RobustParser.py" (or, in cases of extreme enterprisiness,
-#"RobustParserBeanInterface.class") and using the following
-#enterprise-friendly class aliases:
+# Enterprise class names! It has come to our attention that some people
+# think the names of the Beautiful Soup parser classes are too silly
+# and "unprofessional" for use in enterprise screen-scraping. We feel
+# your pain! For such-minded folk, the Beautiful Soup Consortium And
+# All-Night Kosher Bakery recommends renaming this file to
+# "RobustParser.py" (or, in cases of extreme enterprisiness,
+# "RobustParserBeanInterface.class") and using the following
+# enterprise-friendly class aliases:
+
+
class RobustXMLParser(BeautifulStoneSoup):
pass
+
+
class RobustHTMLParser(BeautifulSoup):
pass
+
+
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
pass
+
+
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
pass
+
+
class SimplifyingSOAPParser(BeautifulSOAP):
pass
@@ -1732,6 +1756,7 @@ class SimplifyingSOAPParser(BeautifulSOAP):
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).
+
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
@@ -1753,6 +1778,7 @@ try:
except ImportError:
pass
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -1763,14 +1789,14 @@ class UnicodeDammit:
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
def __init__(self, markup, overrideEncodings=[],
smartQuotesTo='xml', isHTML=False):
self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1781,11 +1807,13 @@ class UnicodeDammit:
u = None
for proposedEncoding in overrideEncodings:
u = self._convertFrom(proposedEncoding)
- if u: break
+ if u:
+ break
if not u:
for proposedEncoding in (documentEncoding, sniffedEncoding):
u = self._convertFrom(proposedEncoding)
- if u: break
+ if u:
+ break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
@@ -1795,10 +1823,12 @@ class UnicodeDammit:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
- if u: break
+ if u:
+ break
self.unicode = u
- if not u: self.originalEncoding = None
+ if not u:
+ self.originalEncoding = None
def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML
@@ -1823,9 +1853,8 @@ class UnicodeDammit:
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ markup = re.compile("([\x80-\x9f])").sub(lambda(x): self._subMSChar(x.group(1)),
+ markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1836,7 +1865,7 @@ class UnicodeDammit:
# print "That didn't work!"
# print e
return None
- #print "Correct encoding: %s" % proposed
+ # print "Correct encoding: %s" % proposed
return self.markup
def _toUnicode(self, data, encoding):
@@ -1845,11 +1874,11 @@ class UnicodeDammit:
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
@@ -1876,7 +1905,7 @@ class UnicodeDammit:
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
+ and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
@@ -1885,7 +1914,7 @@ class UnicodeDammit:
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
+ (xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
@@ -1931,15 +1960,15 @@ class UnicodeDammit:
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
- or (charset and self._codec(charset.replace("-", ""))) \
- or (charset and self._codec(charset.replace("-", "_"))) \
- or charset
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
def _codec(self, charset):
- if not charset: return charset
+ if not charset:
+ return charset
codec = None
try:
codecs.lookup(charset)
@@ -1949,68 +1978,69 @@ class UnicodeDammit:
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
+ emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
+ 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
+ 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
+ 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33,
+ 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94,
+ 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63,
+ 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34,
+ 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200,
+ 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205,
+ 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210,
+ 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89,
+ 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+ 250, 251, 252, 253, 254, 255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {'\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''), }
#######################################################################
-#By default, act as an HTML pretty-printer.
+# By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)