Diffstat (limited to 'cgi/BeautifulSoup.py')
-rw-r--r--    cgi/BeautifulSoup.py    480
1 files changed, 255 insertions, 225 deletions
diff --git a/cgi/BeautifulSoup.py b/cgi/BeautifulSoup.py
index 7278215..3e97785 100644
--- a/cgi/BeautifulSoup.py
+++ b/cgi/BeautifulSoup.py
@@ -90,26 +90,28 @@ import types
import re
import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
try:
set
except NameError:
from sets import Set as set
-#These hacks make Beautiful Soup able to parse XML with namespaces
+# These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
def _match_css_class(str):
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
# First, the classes that represent markup elements.
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -117,15 +119,15 @@ class PageElement(object):
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k, v in h.items():
i[v] = k
return i
- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
- "quot" : '"',
- "amp" : "&",
- "lt" : "<",
- "gt" : ">" }
+ XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
+ "quot": '"',
+ "amp": "&",
+ "lt": "<",
+ "gt": ">"}
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
@@ -145,7 +147,7 @@ class PageElement(object):
oldParent = self.parent
myIndex = self.parent.index(self)
if hasattr(replaceWith, "parent")\
- and replaceWith.parent is self.parent:
+ and replaceWith.parent is self.parent:
# We're replacing this element with one of its siblings.
index = replaceWith.parent.index(replaceWith)
if index and index < myIndex:
@@ -173,9 +175,9 @@ class PageElement(object):
except ValueError:
pass
- #Find the two elements that would be next to each other if
- #this element (and any children) hadn't been parsed. Connect
- #the two.
+ # Find the two elements that would be next to each other if
+ # this element (and any children) hadn't been parsed. Connect
+ # the two.
lastChild = self._lastRecursiveChild()
nextElement = lastChild.next
@@ -203,10 +205,10 @@ class PageElement(object):
def insert(self, position, newChild):
if isinstance(newChild, basestring) \
- and not isinstance(newChild, NavigableString):
+ and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
- position = min(position, len(self.contents))
+ position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
@@ -243,7 +245,7 @@ class PageElement(object):
while not parentsNextSibling:
parentsNextSibling = parent.nextSibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
if parentsNextSibling:
newChildsLastElement.next = parentsNextSibling
@@ -288,7 +290,7 @@ class PageElement(object):
criteria and appear after this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.nextSiblingGenerator, **kwargs)
- fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
@@ -300,8 +302,8 @@ class PageElement(object):
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
return self._findAll(name, attrs, text, limit, self.previousGenerator,
- **kwargs)
- fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
@@ -315,7 +317,7 @@ class PageElement(object):
criteria and appear before this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -334,9 +336,9 @@ class PageElement(object):
return self._findAll(name, attrs, None, limit, self.parentGenerator,
**kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
+ fetchParents = findParents # Compatibility with pre-3.x
- #These methods do the real heavy lifting.
+ # These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs):
r = None
@@ -381,8 +383,8 @@ class PageElement(object):
break
return results
- #These Generators can be used to navigate starting from both
- #NavigableStrings and Tags.
+ # These Generators can be used to navigate starting from both
+ # NavigableStrings and Tags.
def nextGenerator(self):
i = self
while i is not None:
@@ -431,7 +433,7 @@ class PageElement(object):
s = unicode(s)
else:
if encoding:
- s = self.toEncoding(str(s), encoding)
+ s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
return s
@@ -483,11 +485,13 @@ class NavigableString(unicode, PageElement):
else:
return data
+
class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
class ProcessingInstruction(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
output = self
@@ -495,14 +499,17 @@ class ProcessingInstruction(NavigableString):
output = self.substituteEncoding(output, encoding)
return "<?%s?>" % self.toEncoding(output, encoding)
+
class Comment(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
class Declaration(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!%s>" % NavigableString.__str__(self, encoding)
+
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
@@ -555,15 +562,15 @@ class Tag(PageElement):
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
# Convert any HTML, XML, or numeric entities in the attribute values.
- convert = lambda(k, val): (k,
- re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
- self._convertEntities,
- val))
+ def convert((k, val)): return (k,
+ re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities,
+ val))
self.attrs = map(convert, self.attrs)
def getString(self):
if (len(self.contents) == 1
- and isinstance(self.contents[0], NavigableString)):
+ and isinstance(self.contents[0], NavigableString)):
return self.contents[0]
def setString(self, string):
@@ -646,8 +653,8 @@ class Tag(PageElement):
for item in self.attrs:
if item[0] == key:
self.attrs.remove(item)
- #We don't break because bad HTML can define the same
- #attribute multiple times.
+ # We don't break because bad HTML can define the same
+ # attribute multiple times.
self._getAttrMap()
if self.attrMap.has_key(key):
del self.attrMap[key]
@@ -659,7 +666,7 @@ class Tag(PageElement):
return apply(self.findAll, args, kwargs)
def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
+ # print "Getattr %s.%s" % (self.__class__, tag)
if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
return self.find(tag[:-3])
elif tag.find('__') != 0:
@@ -738,7 +745,8 @@ class Tag(PageElement):
# value might also contain angle brackets, or
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
- val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(
+ self._sub_entity, val)
attrs.append(fmt % (self.toEncoding(key, encoding),
self.toEncoding(val, encoding)))
@@ -802,7 +810,7 @@ class Tag(PageElement):
prettyPrint=False, indentLevel=0):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -819,7 +827,7 @@ class Tag(PageElement):
s.append("\n")
return ''.join(s)
- #Soup methods
+ # Soup methods
def find(self, name=None, attrs={}, recursive=True, text=None,
**kwargs):
@@ -859,7 +867,7 @@ class Tag(PageElement):
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
- #Private methods
+ # Private methods
def _getAttrMap(self):
"""Initializes a map representation of this tag's attributes,
@@ -870,7 +878,7 @@ class Tag(PageElement):
self.attrMap[key] = value
return self.attrMap
- #Generator methods
+ # Generator methods
def childGenerator(self):
# Just use the iterator from the contents
return iter(self.contents)
@@ -917,12 +925,12 @@ class SoupStrainer:
markup = markupName
markupAttrs = markup
callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ and not isinstance(markupName, Tag)
if (not self.name) \
- or callFunctionWithTagData \
- or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
if callFunctionWithTagData:
match = self.name(markupName, markupAttrs)
else:
@@ -930,11 +938,11 @@ class SoupStrainer:
markupAttrMap = None
for attr, matchAgainst in self.attrs.items():
if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
+ if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
- else:
+ else:
markupAttrMap = {}
- for k,v in markupAttrs:
+ for k, v in markupAttrs:
markupAttrMap[k] = v
attrValue = markupAttrMap.get(attr)
if not self._matches(attrValue, matchAgainst):
@@ -948,7 +956,7 @@ class SoupStrainer:
return found
def search(self, markup):
- #print 'looking for %s in %s' % (self, markup)
+ # print 'looking for %s in %s' % (self, markup)
found = None
# If given a list of items, scan it for a text element that
# matches.
@@ -956,7 +964,7 @@ class SoupStrainer:
and not isinstance(markup, Tag):
for element in markup:
if isinstance(element, NavigableString) \
- and self.search(element):
+ and self.search(element):
found = element
break
# If it's a Tag, make sure its name or attributes match.
@@ -966,33 +974,33 @@ class SoupStrainer:
found = self.searchTag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ isinstance(markup, basestring):
if self._matches(markup, self.text):
found = markup
else:
raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ % markup.__class__
return found
def _matches(self, markup, matchAgainst):
- #print "Matching %s against %s" % (markup, matchAgainst)
+ # print "Matching %s against %s" % (markup, matchAgainst)
result = False
if matchAgainst is True:
result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else:
- #Custom match methods take the tag as an argument, but all
- #other ways of matching match the tag name as a string.
+ # Custom match methods take the tag as an argument, but all
+ # other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
if markup and not isinstance(markup, basestring):
markup = unicode(markup)
- #Now we know that chunk is either a string, or None.
+ # Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif hasattr(matchAgainst, '__iter__'): # list-like
+ elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
@@ -1006,15 +1014,18 @@ class SoupStrainer:
result = matchAgainst == markup
return result
+
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
+
def __init__(self, source):
list.__init__([])
self.source = source
# Now, some helper functions.
+
def buildTagMap(default, *args):
"""Turns a list of maps, lists, or scalars into a single map.
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
@@ -1022,20 +1033,21 @@ def buildTagMap(default, *args):
built = {}
for portion in args:
if hasattr(portion, 'items'):
- #It's a map. Merge it.
- for k,v in portion.items():
+ # It's a map. Merge it.
+ for k, v in portion.items():
built[k] = v
- elif hasattr(portion, '__iter__'): # is a list
- #It's a list. Map each item to the default.
+ elif hasattr(portion, '__iter__'): # is a list
+ # It's a list. Map each item to the default.
for k in portion:
built[k] = default
else:
- #It's a scalar. Map it to the default.
+ # It's a scalar. Map it to the default.
built[portion] = default
return built
# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):
"""This class contains the basic parser and search code. It defines
@@ -1078,7 +1090,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
@@ -1155,7 +1167,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
n = int(name)
except ValueError:
return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ if not 0 <= n <= 127: # ASCII ends at 127, not 255
return
return self.convert_codepoint(n)
@@ -1166,9 +1178,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
- dammit = UnicodeDammit\
- (markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ dammit = UnicodeDammit(markup, [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
@@ -1195,10 +1206,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
def __getattr__(self, methodName):
"""This method routes method call requests to either the SGMLParser
superclass or the Tag superclass, depending on the method name."""
- #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+ # print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
- or methodName.startswith('do_'):
+ or methodName.startswith('do_'):
return SGMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
return Tag.__getattr__(self, methodName)
@@ -1209,7 +1220,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ or self.instanceSelfClosingTags.has_key(name)
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
@@ -1224,13 +1235,13 @@ class BeautifulStoneSoup(Tag, SGMLParser):
def popTag(self):
tag = self.tagStack.pop()
- #print "Pop", tag.name
+ # print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
- #print "Push", tag.name
+ # print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
@@ -1248,7 +1259,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
currentData = ' '
self.currentData = []
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
+ (not self.parseOnlyThese.text or
not self.parseOnlyThese.search(currentData)):
return
o = containerClass(currentData)
@@ -1258,13 +1269,12 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instqance of
the given tag."""
- #print "Popping to %s" % name
+ # print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
@@ -1282,7 +1292,6 @@ class BeautifulStoneSoup(Tag, SGMLParser):
return mostRecentTag
def _smartPop(self, name):
-
"""We need to pop up to the previous tag of this type, unless
one of this tag's nesting reset triggers comes between this
tag and the previous tag of this type, OR unless this tag is a
@@ -1307,8 +1316,8 @@ class BeautifulStoneSoup(Tag, SGMLParser):
for i in range(len(self.tagStack)-1, 0, -1):
p = self.tagStack[i]
if (not p or p.name == name) and not isNestable:
- #Non-nestable tags get popped to the top or to their
- #last occurance.
+ # Non-nestable tags get popped to the top or to their
+ # last occurance.
popTo = name
break
if (nestingResetTriggers is not None
@@ -1316,10 +1325,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
or (nestingResetTriggers is None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
- #If we encounter one of the nesting reset triggers
- #peculiar to this tag, or we encounter another tag
- #that causes nesting to reset, pop up to but not
- #including that tag.
+ # If we encounter one of the nesting reset triggers
+ # peculiar to this tag, or we encounter another tag
+ # that causes nesting to reset, pop up to but not
+ # including that tag.
popTo = p.name
inclusive = False
break
@@ -1328,10 +1337,10 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
- #print "Start tag %s: %s" % (name, attrs)
+ # print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
- #This is not a real tag.
- #print "<%s> is not real!" % name
+ # This is not a real tag.
+ # print "<%s> is not real!" % name
attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
self.handle_data('<%s%s>' % (name, attrs))
return
@@ -1341,7 +1350,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
- and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
@@ -1352,16 +1361,16 @@ class BeautifulStoneSoup(Tag, SGMLParser):
if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
- #print "Beginning quote (%s)" % name
+ # print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
return tag
def unknown_endtag(self, name):
- #print "End tag %s" % name
+ # print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
- #This is not a real end tag.
- #print "</%s> is not real!" % name
+ # This is not a real end tag.
+ # print "</%s> is not real!" % name
self.handle_data('</%s>' % name)
return
self.endData()
@@ -1412,27 +1421,27 @@ class BeautifulStoneSoup(Tag, SGMLParser):
pass
if not data and self.convertXMLEntities:
- data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
- data = "&amp;%s" % ref
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&amp;%s" % ref
if not data:
# This case is different from the one above, because we
# haven't already gone through a supposedly comprehensive
@@ -1452,12 +1461,12 @@ class BeautifulStoneSoup(Tag, SGMLParser):
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
@@ -1467,6 +1476,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
j = i + len(toHandle)
return j
+
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1522,46 +1532,46 @@ class BeautifulSoup(BeautifulStoneSoup):
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
- ('br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base', 'col'))
+ ('br', 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
- QUOTE_TAGS = {'script' : None, 'textarea' : None}
+ QUOTE_TAGS = {'script': None, 'textarea': None}
- #According to the HTML standard, each of these inline tags can
- #contain another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
+ # According to the HTML standard, each of these inline tags can
+ # contain another tag of the same type. Furthermore, it's common
+ # to actually use these tags this way.
NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
'center')
- #According to the HTML standard, these block tags can contain
- #another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
+ # According to the HTML standard, these block tags can contain
+ # another tag of the same type. Furthermore, it's common
+ # to actually use these tags this way.
NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
- #Lists can contain other lists, but there are restrictions.
- NESTABLE_LIST_TAGS = { 'ol' : [],
- 'ul' : [],
- 'li' : ['ul', 'ol'],
- 'dl' : [],
- 'dd' : ['dl'],
- 'dt' : ['dl'] }
-
- #Tables can contain other tables, but there are restrictions.
- NESTABLE_TABLE_TAGS = {'table' : [],
- 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
- 'td' : ['tr'],
- 'th' : ['tr'],
- 'thead' : ['table'],
- 'tbody' : ['table'],
- 'tfoot' : ['table'],
+ # Lists can contain other lists, but there are restrictions.
+ NESTABLE_LIST_TAGS = {'ol': [],
+ 'ul': [],
+ 'li': ['ul', 'ol'],
+ 'dl': [],
+ 'dd': ['dl'],
+ 'dt': ['dl']}
+
+ # Tables can contain other tables, but there are restrictions.
+ NESTABLE_TABLE_TAGS = {'table': [],
+ 'tr': ['table', 'tbody', 'tfoot', 'thead'],
+ 'td': ['tr'],
+ 'th': ['tr'],
+ 'thead': ['table'],
+ 'tbody': ['table'],
+ 'tfoot': ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
- #If one of these tags is encountered, all tags up to the next tag of
- #this type are popped.
+ # If one of these tags is encountered, all tags up to the next tag of
+ # this type are popped.
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
NON_NESTABLE_BLOCK_TAGS,
NESTABLE_LIST_TAGS,
@@ -1591,11 +1601,11 @@ class BeautifulSoup(BeautifulStoneSoup):
contentType = value
contentTypeIndex = i
- if httpEquiv and contentType: # It's an interesting meta tag.
+ if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
if (self.declaredHTMLEncoding is not None or
- self.originalEncoding == self.fromEncoding):
+ self.originalEncoding == self.fromEncoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -1620,9 +1630,11 @@ class BeautifulSoup(BeautifulStoneSoup):
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1649,9 +1661,9 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup):
wouldn't be."""
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
- 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big')
+ ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+ 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+ 'big')
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
@@ -1659,6 +1671,7 @@ class ICantBelieveItsBeautifulSoup(BeautifulSoup):
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
"""The MinimalSoup class is for parsing HTML that contains
pathologically bad markup. It makes no assumptions about tag
@@ -1672,6 +1685,7 @@ class MinimalSoup(BeautifulSoup):
RESET_NESTING_TAGS = buildTagMap('noscript')
NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
"""This class will push a tag with only a single string child into
the tag's parent as an attribute. The attribute's name is the tag
@@ -1699,26 +1713,36 @@ class BeautifulSOAP(BeautifulStoneSoup):
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
+ not parent.attrMap.has_key(tag.name)):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
-#Enterprise class names! It has come to our attention that some people
-#think the names of the Beautiful Soup parser classes are too silly
-#and "unprofessional" for use in enterprise screen-scraping. We feel
-#your pain! For such-minded folk, the Beautiful Soup Consortium And
-#All-Night Kosher Bakery recommends renaming this file to
-#"RobustParser.py" (or, in cases of extreme enterprisiness,
-#"RobustParserBeanInterface.class") and using the following
-#enterprise-friendly class aliases:
+# Enterprise class names! It has come to our attention that some people
+# think the names of the Beautiful Soup parser classes are too silly
+# and "unprofessional" for use in enterprise screen-scraping. We feel
+# your pain! For such-minded folk, the Beautiful Soup Consortium And
+# All-Night Kosher Bakery recommends renaming this file to
+# "RobustParser.py" (or, in cases of extreme enterprisiness,
+# "RobustParserBeanInterface.class") and using the following
+# enterprise-friendly class aliases:
+
+
class RobustXMLParser(BeautifulStoneSoup):
pass
+
+
class RobustHTMLParser(BeautifulSoup):
pass
+
+
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
pass
+
+
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
pass
+
+
class SimplifyingSOAPParser(BeautifulSOAP):
pass
@@ -1732,6 +1756,7 @@ class SimplifyingSOAPParser(BeautifulSOAP):
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).
+
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
@@ -1753,6 +1778,7 @@ try:
except ImportError:
pass
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -1763,14 +1789,14 @@ class UnicodeDammit:
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
def __init__(self, markup, overrideEncodings=[],
smartQuotesTo='xml', isHTML=False):
self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1781,11 +1807,13 @@ class UnicodeDammit:
u = None
for proposedEncoding in overrideEncodings:
u = self._convertFrom(proposedEncoding)
- if u: break
+ if u:
+ break
if not u:
for proposedEncoding in (documentEncoding, sniffedEncoding):
u = self._convertFrom(proposedEncoding)
- if u: break
+ if u:
+ break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
@@ -1795,10 +1823,12 @@ class UnicodeDammit:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
- if u: break
+ if u:
+ break
self.unicode = u
- if not u: self.originalEncoding = None
+ if not u:
+ self.originalEncoding = None
def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML
@@ -1823,9 +1853,8 @@ class UnicodeDammit:
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ markup = re.compile("([\x80-\x9f])").sub(lambda(x): self._subMSChar(x.group(1)),
+ markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1836,7 +1865,7 @@ class UnicodeDammit:
# print "That didn't work!"
# print e
return None
- #print "Correct encoding: %s" % proposed
+ # print "Correct encoding: %s" % proposed
return self.markup
def _toUnicode(self, data, encoding):
@@ -1845,11 +1874,11 @@ class UnicodeDammit:
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
@@ -1876,7 +1905,7 @@ class UnicodeDammit:
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
+ and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
@@ -1885,7 +1914,7 @@ class UnicodeDammit:
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
+ (xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
@@ -1931,15 +1960,15 @@ class UnicodeDammit:
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
- or (charset and self._codec(charset.replace("-", ""))) \
- or (charset and self._codec(charset.replace("-", "_"))) \
- or charset
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
def _codec(self, charset):
- if not charset: return charset
+ if not charset:
+ return charset
codec = None
try:
codecs.lookup(charset)
@@ -1949,68 +1978,69 @@ class UnicodeDammit:
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
+ emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
+ 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
+ 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
+ 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33,
+ 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94,
+ 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63,
+ 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34,
+ 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200,
+ 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205,
+ 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210,
+ 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89,
+ 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+ 250, 251, 252, 253, 254, 255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {'\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''), }
#######################################################################
-#By default, act as an HTML pretty-printer.
+# By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)