From a16f23f034fb942b44e0665224d499af57aec1ad Mon Sep 17 00:00:00 2001 From: neptune Date: Mon, 15 Nov 2021 13:35:08 -0300 Subject: Nueva revisiĆ³n Py3 --- cgi/BeautifulSoup.py | 2047 ---------------------------- cgi/api.py | 147 +- cgi/fcgi.py | 1363 ------------------- cgi/formatting.py | 29 +- cgi/framework.py | 40 +- cgi/manage.py | 150 +-- cgi/markdown.py | 2093 ----------------------------- cgi/post.py | 31 +- cgi/templates/bans_geo | 2 +- cgi/templates/bans_locations | 2 +- cgi/templates/manage/boardoptions.html | 2 +- cgi/templates/mobile/txt_thread.html | 10 +- cgi/templates/revision.html | 2 +- cgi/templates/txt_archive.html | 4 +- cgi/templates/txt_thread.en.html | 10 +- cgi/tenjin.py | 2293 -------------------------------- cgi/weabot.py | 35 +- 17 files changed, 235 insertions(+), 8025 deletions(-) delete mode 100644 cgi/BeautifulSoup.py delete mode 100644 cgi/fcgi.py delete mode 100644 cgi/markdown.py delete mode 100644 cgi/tenjin.py diff --git a/cgi/BeautifulSoup.py b/cgi/BeautifulSoup.py deleted file mode 100644 index 3e97785..0000000 --- a/cgi/BeautifulSoup.py +++ /dev/null @@ -1,2047 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup parses a (possibly invalid) XML or HTML document into a -tree representation. It provides methods and Pythonic idioms that make -it easy to navigate, search, and modify the tree. - -A well-formed XML/HTML document yields a well-formed data -structure. An ill-formed XML/HTML document yields a correspondingly -ill-formed data structure. If your document is only locally -well-formed, you can use this library to find and process the -well-formed part of it. - -Beautiful Soup works with Python 2.2 and up. It has no external -dependencies, but you'll have more success at converting data to UTF-8 -if you also install these three packages: - -* chardet, for auto-detecting character encodings - http://chardet.feedparser.org/ -* cjkcodecs and iconv_codec, which add more encodings to the ones supported - by stock Python. - http://cjkpython.i18n.org/ - -Beautiful Soup defines classes for two main parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. This class has web browser-like heuristics for - obtaining a sensible parse tree in the face of common HTML errors. - -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting -the encoding of an HTML or XML document, and converting it to -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. - -For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/documentation.html - -Here, have some legalese: - -Copyright (c) 2004-2010, Leonard Richardson - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the the Beautiful Soup Consortium and All - Night Kosher Bakery nor the names of its contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. - -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.2.1" -__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" -__license__ = "New-style BSD" - -from sgmllib import SGMLParser, SGMLParseError -import codecs -import markupbase -import types -import re -import sgmllib -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} -try: - set -except NameError: - from sets import Set as set - -# These hacks make Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match - -DEFAULT_OUTPUT_ENCODING = "utf-8" - - -def _match_css_class(str): - """Build a RE to match the given CSS class.""" - return re.compile(r"(^|.*\s)%s($|\s)" % str) - -# First, the classes that represent markup elements. - - -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def _invert(h): - "Cheap function to invert a hash." - i = {} - for k, v in h.items(): - i[v] = k - return i - - XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'", - "quot": '"', - "amp": "&", - "lt": "<", - "gt": ">"} - - XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) - - def setup(self, parent=None, previous=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.index(self) - if hasattr(replaceWith, "parent")\ - and replaceWith.parent is self.parent: - # We're replacing this element with one of its siblings. - index = replaceWith.parent.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def replaceWithChildren(self): - myParent = self.parent - myIndex = self.parent.index(self) - self.extract() - reversedChildren = list(self.contents) - reversedChildren.reverse() - for child in reversedChildren: - myParent.insert(myIndex, child) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - del self.parent.contents[self.parent.index(self)] - except ValueError: - pass - - # Find the two elements that would be next to each other if - # this element (and any children) hadn't been parsed. Connect - # the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - return self - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." - lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if isinstance(newChild, basestring) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent is not None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent is self: - index = self.index(newChild) - if index > position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position-1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. - break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.insert(len(self.contents), tag) - - def findNext(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - - def findAllNext(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, - **kwargs) - - def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.findNextSiblings, name, attrs, text, - **kwargs) - - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x - - def findPrevious(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) - - def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x - - def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) - - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x - - def findParent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. - r = None - l = self.findParents(name, attrs, 1) - if l: - r = l[0] - return r - - def findParents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - - # These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - # (Possibly) special case some findAll*(...) searches - elif text is None and not limit and not attrs and not kwargs: - # findAll*(True) - if name is True: - return [element for element in generator() - if isinstance(element, Tag)] - # findAll*('tag-name') - elif isinstance(name, basestring): - return [element for element in generator() - if isinstance(element, Tag) and - element.name == name] - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - # Build a SoupStrainer - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() - while True: - try: - i = g.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - # These Generators can be used to navigate starting from both - # NavigableStrings and Tags. - def nextGenerator(self): - i = self - while i is not None: - i = i.next - yield i - - def nextSiblingGenerator(self): - i = self - while i is not None: - i = i.nextSibling - yield i - - def previousGenerator(self): - i = self - while i is not None: - i = i.previous - yield i - - def previousSiblingGenerator(self): - i = self - while i is not None: - i = i.previousSibling - yield i - - def parentGenerator(self): - i = self - while i is not None: - i = i.parent - yield i - - # Utility methods - def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - - -class NavigableString(unicode, PageElement): - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - - def __getnewargs__(self): - return (NavigableString.__str__(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def __unicode__(self): - return str(self).decode(DEFAULT_OUTPUT_ENCODING) - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - # Substitute outgoing XML entities. - data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) - if encoding: - return data.encode(encoding) - else: - return data - - -class CData(NavigableString): - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - - -class ProcessingInstruction(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - output = self - if "%SOUP-ENCODING%" in output: - output = self.substituteEncoding(output, encoding) - return "" % self.toEncoding(output, encoding) - - -class Comment(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - - -class Declaration(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def _convertEntities(self, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. If HTML - entities are being converted, any unrecognized entities are - escaped.""" - x = match.group(1) - if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) - elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: - if self.convertXMLEntities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] - else: - return u'&%s;' % x - elif len(x) > 0 and x[0] == '#': - # Handle numeric entities - if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) - else: - return unichr(int(x[1:])) - - elif self.escapeUnrecognizedEntities: - return u'&%s;' % x - else: - return u'&%s;' % x - - def __init__(self, parser, name, attrs=None, parent=None, - previous=None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected - self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name - if attrs is None: - attrs = [] - elif isinstance(attrs, dict): - attrs = attrs.items() - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - self.convertHTMLEntities = parser.convertHTMLEntities - self.convertXMLEntities = parser.convertXMLEntities - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. - def convert((k, val)): return (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) - self.attrs = map(convert, self.attrs) - - def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): - return self.contents[0] - - def setString(self, string): - """Replace the contents of the tag with a string""" - self.clear() - self.append(string) - - string = property(getString, setString) - - def getText(self, separator=u""): - if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next - strings = [] - current = self.contents[0] - while current is not stopNode: - if isinstance(current, NavigableString): - strings.append(current.strip()) - current = current.next - return separator.join(strings) - - text = property(getText) - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self._getAttrMap().get(key, default) - - def clear(self): - """Extract all children.""" - for child in self.contents[:]: - child.extract() - - def index(self, element): - for i, child in enumerate(self.contents): - if child is element: - return i - raise ValueError("Tag.index: element not in tag") - - def has_key(self, key): - return self._getAttrMap().has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self._getAttrMap()[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - # We don't break because bad HTML can define the same - # attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.findAll, args, kwargs) - - def __getattr__(self, tag): - # print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if other is self: - return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.__str__(encoding) - - def __unicode__(self): - return self.__str__(None) - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isinstance(val, basestring): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub( - self._sub_entity, val) - - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) - close = '' - closeTag = '' - if self.isSelfClosing: - close = ' /' - else: - closeTag = '' % encodedName - - indentTag, indentContents = 0, 0 - if prettyPrint: - indentTag = indentLevel - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: - s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: - s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": - s.append("\n") - if prettyPrint and closeTag: - s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def decompose(self): - """Recursively destroys the contents of this tree.""" - self.extract() - if len(self.contents) == 0: - return - current = self.contents[0] - while current is not None: - next = current.next - if isinstance(current, Tag): - del current.contents[:] - current.parent = None - current.previous = None - current.previousSibling = None - current.next = None - current.nextSibling = None - current = next - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.__str__(encoding, True) - - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) - s.append(text) - if prettyPrint: - s.append("\n") - return ''.join(s) - - # Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) - findChildren = findAll - - # Pre-3.x compatibility methods - first = find - fetch = findAll - - def fetchText(self, text=None, recursive=True, limit=None): - return self.findAll(text=text, recursive=recursive, limit=limit) - - def firstText(self, text=None, recursive=True): - return self.find(text=text, recursive=recursive) - - # Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - - # Generator methods - def childGenerator(self): - # Just use the iterator from the contents - return iter(self.contents) - - def recursiveChildGenerator(self): - if not len(self.contents): - raise StopIteration - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next - - -# Next, a couple classes to represent queries and their results. -class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k, v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - # print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if hasattr(markup, "__iter__") \ - and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - # print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst is True: - result = markup is not None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - # Custom match methods take the tag as an argument, but all - # other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isinstance(markup, basestring): - markup = unicode(markup) - # Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - - def __init__(self, source): - list.__init__([]) - self.source = source - -# Now, some helper functions. - - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - # It's a map. Merge it. - for k, v in portion.items(): - built[k] = v - elif hasattr(portion, '__iter__'): # is a list - # It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - # It's a scalar. Map it to the default. - built[portion] = default - return built - -# Now, the parser classes. - - -class BeautifulStoneSoup(Tag, SGMLParser): - - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". - - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" - - SELF_CLOSING_TAGS = {} - NESTABLE_TAGS = {} - RESET_NESTING_TAGS = {} - QUOTE_TAGS = {} - PRESERVE_WHITESPACE_TAGS = [] - - MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda x: x.group(1) + ' />'), - (re.compile(']*)>'), - lambda x: '') - ] - - ROOT_TAG_NAME = u'[document]' - - HTML_ENTITIES = "html" - XML_ENTITIES = "xml" - XHTML_ENTITIES = "xhtml" - # TODO: This only exists for backwards-compatibility - ALL_ENTITIES = XHTML_ENTITIES - - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. - STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } - - def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, - markupMassage=True, smartQuotesTo=XML_ENTITIES, - convertEntities=None, selfClosingTags=None, isHTML=False): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. - - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. - - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. - - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: - -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" - - self.parseOnlyThese = parseOnlyThese - self.fromEncoding = fromEncoding - self.smartQuotesTo = smartQuotesTo - self.convertEntities = convertEntities - # Set the rules for how we'll deal with the entities we - # encounter - if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. - self.smartQuotesTo = None - if convertEntities == self.HTML_ENTITIES: - self.convertXMLEntities = False - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = True - elif convertEntities == self.XHTML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = False - elif convertEntities == self.XML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - else: - self.convertXMLEntities = False - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - - self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - self.markup = markup - self.markupMassage = markupMassage - try: - self._feed(isHTML=isHTML) - except StopParsing: - pass - self.markup = None # The markup can now be GCed - - def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" - try: - n = int(name) - except ValueError: - return - if not 0 <= n <= 127: # ASCII ends at 127, not 255 - return - return self.convert_codepoint(n) - - def _feed(self, inDocumentEncoding=None, isHTML=False): - # Convert the document to Unicode. - markup = self.markup - if isinstance(markup, unicode): - if not hasattr(self, 'originalEncoding'): - self.originalEncoding = None - else: - dammit = UnicodeDammit(markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding - self.declaredHTMLEncoding = dammit.declaredHTMLEncoding - if markup: - if self.markupMassage: - if not hasattr(self.markupMassage, "__iter__"): - self.markupMassage = self.MARKUP_MASSAGE - for fix, m in self.markupMassage: - markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. - del(self.markupMassage) - self.reset() - - SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - # print "__getattr__ called on %s.%s" % (self.__class__, methodName) - - if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): - return SGMLParser.__getattr__(self, methodName) - elif not methodName.startswith('__'): - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - - def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) - - def reset(self): - Tag.__init__(self, self, self.ROOT_TAG_NAME) - self.hidden = 1 - SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - - # print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - # print "Push", tag.name - if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass=NavigableString): - if self.currentData: - currentData = u''.join(self.currentData) - if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.PRESERVE_WHITESPACE_TAGS)): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or - not self.parseOnlyThese.search(currentData)): - return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - - def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - # print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack)-1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def _smartPop(self, name): - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

FooBar *

* should pop to 'p', not 'b'. -

FooBar *

* should pop to 'table', not 'p'. -

Foo

Bar *

* should pop to 'tr', not 'p'. - -

    • *
    • * should pop to 'ul', not the first 'li'. -
  • ** should pop to 'table', not the first 'tr' - tag should - implicitly close the previous tag within the same
    ** should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.tagStack)-1, 0, -1): - p = self.tagStack[i] - if (not p or p.name == name) and not isNestable: - # Non-nestable tags get popped to the top or to their - # last occurance. - popTo = name - break - if (nestingResetTriggers is not None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers is None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): - - # If we encounter one of the nesting reset triggers - # peculiar to this tag, or we encounter another tag - # that causes nesting to reset, pop up to but not - # including that tag. - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self._popToTag(popTo, inclusive) - - def unknown_starttag(self, name, attrs, selfClosing=0): - # print "Start tag %s: %s" % (name, attrs) - if self.quoteStack: - # This is not a real tag. - # print "<%s> is not real!" % name - attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) - self.handle_data('<%s%s>' % (name, attrs)) - return - self.endData() - - if not self.isSelfClosingTag(name) and not selfClosing: - self._smartPop(name) - - if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): - return - - tag = Tag(self, name, attrs, self.currentTag, self.previous) - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - if selfClosing or self.isSelfClosingTag(name): - self.popTag() - if name in self.QUOTE_TAGS: - # print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - return tag - - def unknown_endtag(self, name): - # print "End tag %s" % name - if self.quoteStack and self.quoteStack[-1] != name: - # This is not a real end tag. - # print " is not real!" % name - self.handle_data('' % name) - return - self.endData() - self._popToTag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, data): - self.currentData.append(data) - - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) - - def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction) - - def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." - if self.convertEntities: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertHTMLEntities: - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - - if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) - - if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref - if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. - data = "&%s;" % ref - self.handle_data(data) - - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - - -class BeautifulSoup(BeautifulStoneSoup): - - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. - - * Tag nesting rules: - - Most tags can't be nested at all. For instance, the occurance of - a

    tag should implicitly close the previous

    tag. - -

    Para1

    Para2 - should be transformed into: -

    Para1

    Para2 - - Some tags can be nested arbitrarily. For instance, the occurance - of a

    tag should _not_ implicitly close the previous -
    tag. - - Alice said:
    Bob said:
    Blah - should NOT be transformed into: - Alice said:
    Bob said:
    Blah - - Some tags can be nested, but the nesting is reset by the - interposition of other tags. For instance, a
    , - but not close a tag in another table. - -
    BlahBlah - should be transformed into: -
    BlahBlah - but, - Blah
    Blah - should NOT be transformed into - Blah
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" - - def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): - kwargs['smartQuotesTo'] = self.HTML_ENTITIES - kwargs['isHTML'] = True - BeautifulStoneSoup.__init__(self, *args, **kwargs) - - SELF_CLOSING_TAGS = buildTagMap(None, - ('br', 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base', 'col')) - - PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) - - QUOTE_TAGS = {'script': None, 'textarea': None} - - # According to the HTML standard, each of these inline tags can - # contain another tag of the same type. Furthermore, it's common - # to actually use these tags this way. - NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center') - - # According to the HTML standard, these block tags can contain - # another tag of the same type. Furthermore, it's common - # to actually use these tags this way. - NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - - # Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = {'ol': [], - 'ul': [], - 'li': ['ul', 'ol'], - 'dl': [], - 'dd': ['dl'], - 'dt': ['dl']} - - # Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table': [], - 'tr': ['table', 'tbody', 'tfoot', 'thead'], - 'td': ['tr'], - 'th': ['tr'], - 'thead': ['table'], - 'tbody': ['table'], - 'tfoot': ['table'], - } - - NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - - # If one of these tags is encountered, all tags up to the next tag of - # this type are popped. - RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', - NON_NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, - NESTABLE_TABLE_TAGS) - - NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - - # Used to detect the charset in a META tag; see start_meta - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. - match = self.CHARSET_RE.search(contentType) - if match: - if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - newAttr = self.CHARSET_RE.sub(rewrite, contentType) - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. - # Go through it again with the encoding information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - pass - tag = self.unknown_starttag("meta", attrs) - if tag and tagNeedsEncodingSubstitution: - tag.containsSubstitutions = True - - -class StopParsing(Exception): - pass - - -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - FooBar - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big') - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - - -class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that - ' - for text, lspace, funcdecl, rspace, end_p in self._scan_chunks(input, filename): - if end_p: - break - if funcdecl: - buf.append(text) - if re.match(r'^\$?\w+\(', funcdecl): - buf.extend((lspace or '', stag, 'function ', - funcdecl, "{var _buf='';", rspace or '')) - else: - m = re.match(r'(.+?)\((.*)\)', funcdecl) - buf.extend((lspace or '', stag, m.group( - 1), '=function(', m.group(2), "){var _buf='';", rspace or '')) - else: - self._parse_stmts(text, buf) - buf.extend( - (lspace or '', "return _buf;};", etag, rspace or '')) - # - buf.append(text) - - STMT_REXP = re.compile( - r'(?:^( *)<|<)\?js(\s.*?) ?\?>([ \t]*\r?\n)?', re.M | re.S) - - def _scan_stmts(self, input): - rexp = self.STMT_REXP - pos = 0 - for m in rexp.finditer(input): - lspace, code, rspace = m.groups() - text = input[pos:m.start()] - pos = m.end() - yield text, lspace, code, rspace, False - rest = input[pos:] - yield rest, None, None, None, True - - def _parse_stmts(self, input, buf): - if not input: - return - for text, lspace, code, rspace, end_p in self._scan_stmts(input): - if end_p: - break - if lspace is not None and rspace is not None: - self._parse_exprs(text, buf) - buf.extend((lspace, code, rspace)) - else: - if lspace: - text += lspace - self._parse_exprs(text, buf) - buf.append(code) - if rspace: - self._parse_exprs(rspace, buf) - if text: - self._parse_exprs(text, buf) - - s = r'(?:\{[^{}]*?\}[^{}]*?)*' - EXPR_REXP = re.compile(r'\{=(.*?)=\}|([$#])\{(.*?' + s + r')\}', re.S) - del s - - def _get_expr(self, m): - code1, ch, code2 = m.groups() - if ch: - code = code2 - escape_p = ch == '$' - elif code1[0] == code1[-1] == '=': - code = code1[1:-1] - escape_p = False - else: - code = code1 - escape_p = True - return code, escape_p - - def _scan_exprs(self, input): - rexp = self.EXPR_REXP - pos = 0 - for m in rexp.finditer(input): - text = input[pos:m.start()] - pos = m.end() - code, escape_p = self._get_expr(m) - yield text, code, escape_p, False - rest = input[pos:] - yield rest, None, None, True - - def _parse_exprs(self, input, buf): - if not input: - return - buf.append("_buf+=") - extend = buf.extend - op = '' - for text, code, escape_p, end_p in self._scan_exprs(input): - if end_p: - break - if text: - extend((op, self._escape_text(text))) - op = '+' - if code: - extend((op, escape_p and '_E(' or '_S(', code, ')')) - op = '+' - rest = text - if rest: - extend((op, self._escape_text(rest))) - if input.endswith("\n"): - buf.append(";\n") - else: - buf.append(";") - - def _escape_text(self, text): - lines = text.splitlines(True) - fn = self._escape_str - s = "\\\n".join(fn(line) for line in lines) - return "".join(("'", s, "'")) - - def _escape_str(self, string): - return string.replace("\\", "\\\\").replace("'", "\\'").replace("\n", r"\n") - - -def _linenum(input, pos): - return input[0:pos].count("\n") + 1 - - -JS_FUNC = r""" -function _S(x){return x==null?'':x;} -function _E(x){return x==null?'':typeof(x)!=='string'?x:x.replace(/[&<>"']/g,_EF);} -var _ET={'&':"&",'<':"<",'>':">",'"':""","'":"'"}; -function _EF(c){return _ET[c];}; -"""[1:-1] -JS_FUNC = escaped.EscapedStr(JS_FUNC) - - -## -# cache storages -## - -class CacheStorage(object): - """[abstract] Template object cache class (in memory and/or file)""" - - def __init__(self): - self.items = {} # key: full path, value: template object - - def get(self, cachepath, create_template): - """get template object. if not found, load attributes from cache file and restore template object.""" - template = self.items.get(cachepath) - if not template: - dct = self._load(cachepath) - if dct: - template = create_template() - for k in dct: - setattr(template, k, dct[k]) - self.items[cachepath] = template - return template - - def set(self, cachepath, template): - """set template object and save template attributes into cache file.""" - self.items[cachepath] = template - dct = self._save_data_of(template) - return self._store(cachepath, dct) - - def _save_data_of(self, template): - return {'args': template.args, 'bytecode': template.bytecode, - 'script': template.script, 'timestamp': template.timestamp} - - def unset(self, cachepath): - """remove template object from dict and cache file.""" - self.items.pop(cachepath, None) - return self._delete(cachepath) - - def clear(self): - """remove all template objects and attributes from dict and cache file.""" - d, self.items = self.items, {} - for k in d.iterkeys(): - self._delete(k) - d.clear() - - def _load(self, cachepath): - """(abstract) load dict object which represents template object attributes from cache file.""" - raise NotImplementedError.new( - "%s#_load(): not implemented yet." % self.__class__.__name__) - - def _store(self, cachepath, template): - """(abstract) load dict object which represents template object attributes from cache file.""" - raise NotImplementedError.new( - "%s#_store(): not implemented yet." % self.__class__.__name__) - - def _delete(self, cachepath): - """(abstract) remove template object from cache file.""" - raise NotImplementedError.new( - "%s#_delete(): not implemented yet." % self.__class__.__name__) - - -class MemoryCacheStorage(CacheStorage): - - def _load(self, cachepath): - return None - - def _store(self, cachepath, template): - pass - - def _delete(self, cachepath): - pass - - -class FileCacheStorage(CacheStorage): - - def _load(self, cachepath): - if not _isfile(cachepath): - return None - if logger: - logger.info("[tenjin.%s] load cache (file=%r)" % - (self.__class__.__name__, cachepath)) - data = _read_binary_file(cachepath) - return self._restore(data) - - def _store(self, cachepath, dct): - if logger: - logger.info("[tenjin.%s] store cache (file=%r)" % - (self.__class__.__name__, cachepath)) - data = self._dump(dct) - _write_binary_file(cachepath, data) - - def _restore(self, data): - raise NotImplementedError( - "%s._restore(): not implemented yet." % self.__class__.__name__) - - def _dump(self, dct): - raise NotImplementedError( - "%s._dump(): not implemented yet." % self.__class__.__name__) - - def _delete(self, cachepath): - _ignore_not_found_error(lambda: os.unlink(cachepath)) - - -class MarshalCacheStorage(FileCacheStorage): - - def _restore(self, data): - return marshal.loads(data) - - def _dump(self, dct): - return marshal.dumps(dct) - - -class PickleCacheStorage(FileCacheStorage): - - def __init__(self, *args, **kwargs): - global pickle - if pickle is None: - import cPickle as pickle - FileCacheStorage.__init__(self, *args, **kwargs) - - def _restore(self, data): - return pickle.loads(data) - - def _dump(self, dct): - dct.pop('bytecode', None) - return pickle.dumps(dct) - - -class TextCacheStorage(FileCacheStorage): - - def _restore(self, data): - header, script = data.split("\n\n", 1) - timestamp = encoding = args = None - for line in header.split("\n"): - key, val = line.split(": ", 1) - if key == 'timestamp': - timestamp = float(val) - elif key == 'encoding': - encoding = val - elif key == 'args': - args = val.split(', ') - if encoding: - script = script.decode(encoding) # binary(=str) to unicode - return {'args': args, 'script': script, 'timestamp': timestamp} - - def _dump(self, dct): - s = dct['script'] - if dct.get('encoding') and isinstance(s, unicode): - s = s.encode(dct['encoding']) # unicode to binary(=str) - sb = [] - sb.append("timestamp: %s\n" % dct['timestamp']) - if dct.get('encoding'): - sb.append("encoding: %s\n" % dct['encoding']) - if dct.get('args') is not None: - sb.append("args: %s\n" % ', '.join(dct['args'])) - sb.append("\n") - sb.append(s) - s = ''.join(sb) - if python3: - if isinstance(s, str): - # unicode(=str) to binary - s = s.encode(dct.get('encoding') or 'utf-8') - return s - - def _save_data_of(self, template): - dct = FileCacheStorage._save_data_of(self, template) - dct['encoding'] = template.encoding - return dct - - -## -# abstract class for data cache -## -class KeyValueStore(object): - - def get(self, key, *options): - raise NotImplementedError( - "%s.get(): not implemented yet." % self.__class__.__name__) - - def set(self, key, value, *options): - raise NotImplementedError( - "%s.set(): not implemented yet." % self.__class__.__name__) - - def delete(self, key, *options): - raise NotImplementedError( - "%s.del(): not implemented yet." % self.__class__.__name__) - - def has(self, key, *options): - raise NotImplementedError( - "%s.has(): not implemented yet." % self.__class__.__name__) - - -## -# memory base data cache -## -class MemoryBaseStore(KeyValueStore): - - def __init__(self): - self.values = {} - - def get(self, key, original_timestamp=None): - tupl = self.values.get(key) - if not tupl: - return None - value, created_at, expires_at = tupl - if original_timestamp is not None and created_at < original_timestamp: - self.delete(key) - return None - if expires_at < _time(): - self.delete(key) - return None - return value - - def set(self, key, value, lifetime=0): - created_at = _time() - expires_at = lifetime and created_at + lifetime or 0 - self.values[key] = (value, created_at, expires_at) - return True - - def delete(self, key): - try: - del self.values[key] - return True - except KeyError: - return False - - def has(self, key): - pair = self.values.get(key) - if not pair: - return False - value, created_at, expires_at = pair - if expires_at and expires_at < _time(): - self.delete(key) - return False - return True - - -## -# file base data cache -## -class FileBaseStore(KeyValueStore): - - lifetime = 604800 # = 60*60*24*7 - - def __init__(self, root_path, encoding=None): - if not os.path.isdir(root_path): - raise ValueError("%r: directory not found." % (root_path, )) - self.root_path = root_path - if encoding is None and python3: - encoding = 'utf-8' - self.encoding = encoding - - _pat = re.compile(r'[^-.\/\w]') - - def filepath(self, key, _pat1=_pat): - return os.path.join(self.root_path, _pat1.sub('_', key)) - - def get(self, key, original_timestamp=None): - fpath = self.filepath(key) - # if not _isfile(fpath): return None - stat = _ignore_not_found_error(lambda: os.stat(fpath), None) - if stat is None: - return None - created_at = stat.st_ctime - expires_at = stat.st_mtime - if original_timestamp is not None and created_at < original_timestamp: - self.delete(key) - return None - if expires_at < _time(): - self.delete(key) - return None - if self.encoding: - def f(): return _read_text_file(fpath, self.encoding) - else: - def f(): return _read_binary_file(fpath) - return _ignore_not_found_error(f, None) - - def set(self, key, value, lifetime=0): - fpath = self.filepath(key) - dirname = os.path.dirname(fpath) - if not os.path.isdir(dirname): - os.makedirs(dirname) - now = _time() - if isinstance(value, _unicode): - value = value.encode(self.encoding or 'utf-8') - _write_binary_file(fpath, value) - expires_at = now + (lifetime or self.lifetime) # timestamp - os.utime(fpath, (expires_at, expires_at)) - return True - - def delete(self, key): - fpath = self.filepath(key) - ret = _ignore_not_found_error(lambda: os.unlink(fpath), False) - return ret != False - - def has(self, key): - fpath = self.filepath(key) - if not _isfile(fpath): - return False - if _getmtime(fpath) < _time(): - self.delete(key) - return False - return True - - -## -# html fragment cache helper class -## -class FragmentCacheHelper(object): - """html fragment cache helper class.""" - - lifetime = 60 # 1 minute - prefix = None - - def __init__(self, store, lifetime=None, prefix=None): - self.store = store - if lifetime is not None: - self.lifetime = lifetime - if prefix is not None: - self.prefix = prefix - - def not_cached(self, cache_key, lifetime=None): - """(obsolete. use cache_as() instead of this.) - html fragment cache helper. see document of FragmentCacheHelper class.""" - context = sys._getframe(1).f_locals['_context'] - context['_cache_key'] = cache_key - key = self.prefix and self.prefix + cache_key or cache_key - value = self.store.get(key) - if value: # cached - if logger: - logger.debug('[tenjin.not_cached] %r: cached.' % (cache_key, )) - context[key] = value - return False - else: # not cached - if logger: - logger.debug( - '[tenjin.not_cached]: %r: not cached.' % (cache_key, )) - if key in context: - del context[key] - if lifetime is None: - lifetime = self.lifetime - context['_cache_lifetime'] = lifetime - helpers.start_capture(cache_key, _depth=2) - return True - - def echo_cached(self): - """(obsolete. use cache_as() instead of this.) - html fragment cache helper. see document of FragmentCacheHelper class.""" - f_locals = sys._getframe(1).f_locals - context = f_locals['_context'] - cache_key = context.pop('_cache_key') - key = self.prefix and self.prefix + cache_key or cache_key - if key in context: # cached - value = context.pop(key) - else: # not cached - value = helpers.stop_capture(False, _depth=2) - lifetime = context.pop('_cache_lifetime') - self.store.set(key, value, lifetime) - f_locals['_buf'].append(value) - - def functions(self): - """(obsolete. use cache_as() instead of this.)""" - return (self.not_cached, self.echo_cached) - - def cache_as(self, cache_key, lifetime=None): - key = self.prefix and self.prefix + cache_key or cache_key - _buf = sys._getframe(1).f_locals['_buf'] - value = self.store.get(key) - if value: - if logger: - logger.debug('[tenjin.cache_as] %r: cache found.' % - (cache_key, )) - _buf.append(value) - else: - if logger: - logger.debug( - '[tenjin.cache_as] %r: expired or not cached yet.' % (cache_key, )) - _buf_len = len(_buf) - yield None - value = ''.join(_buf[_buf_len:]) - self.store.set(key, value, lifetime) - - -# you can change default store by 'tenjin.helpers.fragment_cache.store = ...' -helpers.fragment_cache = FragmentCacheHelper(MemoryBaseStore()) -helpers.not_cached = helpers.fragment_cache.not_cached -helpers.echo_cached = helpers.fragment_cache.echo_cached -helpers.cache_as = helpers.fragment_cache.cache_as -helpers.__all__.extend(('not_cached', 'echo_cached', 'cache_as')) - - -## -# helper class to find and read template -## -class Loader(object): - - def exists(self, filepath): - raise NotImplementedError( - "%s.exists(): not implemented yet." % self.__class__.__name__) - - def find(self, filename, dirs=None): - #: if dirs provided then search template file from it. - if dirs: - for dirname in dirs: - filepath = os.path.join(dirname, filename) - if self.exists(filepath): - return filepath - #: if dirs not provided then just return filename if file exists. - else: - if self.exists(filename): - return filename - #: if file not found then return None. - return None - - def abspath(self, filename): - raise NotImplementedError( - "%s.abspath(): not implemented yet." % self.__class__.__name__) - - def timestamp(self, filepath): - raise NotImplementedError( - "%s.timestamp(): not implemented yet." % self.__class__.__name__) - - def load(self, filepath): - raise NotImplementedError( - "%s.timestamp(): not implemented yet." % self.__class__.__name__) - - -## -# helper class to find and read files -## -class FileSystemLoader(Loader): - - def exists(self, filepath): - #: return True if filepath exists as a file. - return os.path.isfile(filepath) - - def abspath(self, filepath): - #: return full-path of filepath - return os.path.abspath(filepath) - - def timestamp(self, filepath): - #: return mtime of file - return _getmtime(filepath) - - def load(self, filepath): - #: if file exists, return file content and mtime - def f(): - mtime = _getmtime(filepath) - input = _read_template_file(filepath) - mtime2 = _getmtime(filepath) - if mtime != mtime2: - mtime = mtime2 - input = _read_template_file(filepath) - mtime2 = _getmtime(filepath) - if mtime != mtime2: - if logger: - logger.warn( - "[tenjin] %s.load(): timestamp is changed while reading file." % self.__class__.__name__) - return input, mtime - #: if file not exist, return None - return _ignore_not_found_error(f) - - -## -## -## -class TemplateNotFoundError(Exception): - pass - - -## -# template engine class -## - -class Engine(object): - """Template Engine class. - See User's Guide and examples for details. - http://www.kuwata-lab.com/tenjin/pytenjin-users-guide.html - http://www.kuwata-lab.com/tenjin/pytenjin-examples.html - """ - - # default value of attributes - prefix = '' - postfix = '' - layout = None - templateclass = Template - path = None - cache = TextCacheStorage() # save converted Python code into text file - lang = None - loader = FileSystemLoader() - preprocess = False - preprocessorclass = Preprocessor - timestamp_interval = 1 # seconds - - def __init__(self, prefix=None, postfix=None, layout=None, path=None, cache=True, preprocess=None, templateclass=None, preprocessorclass=None, lang=None, loader=None, pp=None, **kwargs): - """Initializer of Engine class. - - prefix:str (='') - Prefix string used to convert template short name to template filename. - postfix:str (='') - Postfix string used to convert template short name to template filename. - layout:str (=None) - Default layout template name. - path:list of str(=None) - List of directory names which contain template files. - cache:bool or CacheStorage instance (=True) - Cache storage object to store converted python code. - If True, default cache storage (=Engine.cache) is used (if it is None - then create MarshalCacheStorage object for each engine object). - If False, no cache storage is used nor no cache files are created. - preprocess:bool(=False) - Activate preprocessing or not. - templateclass:class (=Template) - Template class which engine creates automatically. - lang:str (=None) - Language name such as 'en', 'fr', 'ja', and so on. If you specify - this, cache file path will be 'inex.html.en.cache' for example. - pp:list (=None) - List of preprocessor object which is callable and manipulates template content. - kwargs:dict - Options for Template class constructor. - See document of Template.__init__() for details. - """ - if prefix: - self.prefix = prefix - if postfix: - self.postfix = postfix - if layout: - self.layout = layout - if templateclass: - self.templateclass = templateclass - if preprocessorclass: - self.preprocessorclass = preprocessorclass - if path is not None: - self.path = path - if lang is not None: - self.lang = lang - if loader is not None: - self.loader = loader - if preprocess is not None: - self.preprocess = preprocess - if pp is None: - pp = [] - elif isinstance(pp, list): - pass - elif isinstance(pp, tuple): - pp = list(pp) - else: - raise TypeError("'pp' expected to be a list but got %r." % (pp,)) - self.pp = pp - if preprocess: - self.pp.append(TemplatePreprocessor(self.preprocessorclass)) - self.kwargs = kwargs - self.encoding = kwargs.get('encoding') - self._filepaths = {} # template_name => relative path and absolute path - self._added_templates = {} # templates added by add_template() - #self.cache = cache - self._set_cache_storage(cache) - - def _set_cache_storage(self, cache): - if cache is True: - if not self.cache: - self.cache = MarshalCacheStorage() - elif cache is None: - pass - elif cache is False: - self.cache = None - elif isinstance(cache, CacheStorage): - self.cache = cache - else: - raise ValueError("%r: invalid cache object." % (cache, )) - - def cachename(self, filepath): - #: if lang is provided then add it to cache filename. - if self.lang: - return '%s.%s.cache' % (filepath, self.lang) - #: return cache file name. - else: - return filepath + '.cache' - - def to_filename(self, template_name): - """Convert template short name into filename. - ex. - >>> engine = tenjin.Engine(prefix='user_', postfix='.pyhtml') - >>> engine.to_filename(':list') - 'user_list.pyhtml' - >>> engine.to_filename('list') - 'list' - """ - #: if template_name starts with ':', add prefix and postfix to it. - if template_name[0] == ':': - return self.prefix + template_name[1:] + self.postfix - #: if template_name doesn't start with ':', just return it. - return template_name - - def _create_template(self, input=None, filepath=None, _context=None, _globals=None): - #: if input is not specified then just create empty template object. - template = self.templateclass(None, **self.kwargs) - #: if input is specified then create template object and return it. - if input: - template.convert(input, filepath) - return template - - def _preprocess(self, input, filepath, _context, _globals): - #if _context is None: _context = {} - #if _globals is None: _globals = sys._getframe(3).f_globals - #: preprocess template and return result - #preprocessor = self.preprocessorclass(filepath, input=input) - # return preprocessor.render(_context, globals=_globals) - #: preprocesses input with _context and returns result. - if '_engine' not in _context: - self.hook_context(_context) - for pp in self.pp: - input = pp.__call__(input, filename=filepath, - context=_context, globals=_globals) - return input - - def add_template(self, template): - self._added_templates[template.filename] = template - - def _get_template_from_cache(self, cachepath, filepath): - #: if template not found in cache, return None - template = self.cache.get(cachepath, self.templateclass) - if not template: - return None - assert template.timestamp is not None - #: if checked within a sec, skip timestamp check. - now = _time() - last_checked = getattr(template, '_last_checked_at', None) - if last_checked and now < last_checked + self.timestamp_interval: - # if logger: logger.trace('[tenjin.%s] timestamp check skipped (%f < %f + %f)' % \ - # (self.__class__.__name__, now, template._last_checked_at, self.timestamp_interval)) - return template - #: if timestamp of template objectis same as file, return it. - if template.timestamp == self.loader.timestamp(filepath): - template._last_checked_at = now - return template - #: if timestamp of template object is different from file, clear it - # cache._delete(cachepath) - if logger: - logger.info("[tenjin.%s] cache expired (filepath=%r)" % - (self.__class__.__name__, filepath)) - return None - - def get_template(self, template_name, _context=None, _globals=None): - """Return template object. - If template object has not registered, template engine creates - and registers template object automatically. - """ - #: accept template_name such as ':index'. - filename = self.to_filename(template_name) - #: if template object is added by add_template(), return it. - if filename in self._added_templates: - return self._added_templates[filename] - #: get filepath and fullpath of template - pair = self._filepaths.get(filename) - if pair: - filepath, fullpath = pair - else: - #: if template file is not found then raise TemplateNotFoundError. - filepath = self.loader.find(filename, self.path) - if not filepath: - raise TemplateNotFoundError( - '%s: filename not found (path=%r).' % (filename, self.path)) - # - fullpath = self.loader.abspath(filepath) - self._filepaths[filename] = (filepath, fullpath) - #: use full path as base of cache file path - cachepath = self.cachename(fullpath) - #: get template object from cache - cache = self.cache - template = cache and self._get_template_from_cache( - cachepath, filepath) or None - #: if template object is not found in cache or is expired... - if not template: - ret = self.loader.load(filepath) - if not ret: - raise TemplateNotFoundError( - "%r: template not found." % filepath) - input, timestamp = ret - if self.pp: # required for preprocessing - if _context is None: - _context = {} - if _globals is None: - _globals = sys._getframe(1).f_globals - input = self._preprocess(input, filepath, _context, _globals) - #: create template object. - template = self._create_template( - input, filepath, _context, _globals) - #: set timestamp and filename of template object. - template.timestamp = timestamp - template._last_checked_at = _time() - #: save template object into cache. - if cache: - if not template.bytecode: - #: ignores syntax error when compiling. - try: - template.compile() - except SyntaxError: - pass - cache.set(cachepath, template) - # else: - # template.compile() - #: - template.filename = filepath - return template - - def include(self, template_name, append_to_buf=True, **kwargs): - """Evaluate template using current local variables as context. - - template_name:str - Filename (ex. 'user_list.pyhtml') or short name (ex. ':list') of template. - append_to_buf:boolean (=True) - If True then append output into _buf and return None, - else return stirng output. - - ex. - - #{include('file.pyhtml', False)} - - """ - #: get local and global vars of caller. - frame = sys._getframe(1) - locals = frame.f_locals - globals = frame.f_globals - #: get _context from caller's local vars. - assert '_context' in locals - context = locals['_context'] - #: if kwargs specified then add them into context. - if kwargs: - context.update(kwargs) - #: get template object with context data and global vars. - # (context and globals are passed to get_template() only for preprocessing.) - template = self.get_template(template_name, context, globals) - #: if append_to_buf is true then add output to _buf. - #: if append_to_buf is false then don't add output to _buf. - if append_to_buf: - _buf = locals['_buf'] - else: - _buf = None - #: render template and return output. - s = template.render(context, globals, _buf=_buf) - #: kwargs are removed from context data. - if kwargs: - for k in kwargs: - del context[k] - return s - - def render(self, template_name, context=None, globals=None, layout=True): - """Evaluate template with layout file and return result of evaluation. - - template_name:str - Filename (ex. 'user_list.pyhtml') or short name (ex. ':list') of template. - context:dict (=None) - Context object to evaluate. If None then new dict is used. - globals:dict (=None) - Global context to evaluate. If None then globals() is used. - layout:str or Bool(=True) - If True, the default layout name specified in constructor is used. - If False, no layout template is used. - If str, it is regarded as layout template name. - - If temlate object related with the 'template_name' argument is not exist, - engine generates a template object and register it automatically. - """ - if context is None: - context = {} - if globals is None: - globals = sys._getframe(1).f_globals - self.hook_context(context) - while True: - # context and globals are passed to get_template() only for preprocessing - template = self.get_template(template_name, context, globals) - content = template.render(context, globals) - layout = context.pop('_layout', layout) - if layout is True or layout is None: - layout = self.layout - if not layout: - break - template_name = layout - layout = False - context['_content'] = content - context.pop('_content', None) - return content - - def hook_context(self, context): - #: add engine itself into context data. - context['_engine'] = self - #context['render'] = self.render - #: add include() method into context data. - context['include'] = self.include - - -## -# safe template and engine -## - -class SafeTemplate(Template): - """Uses 'to_escaped()' instead of 'escape()'. - '#{...}' is not allowed with this class. Use '[==...==]' instead. - """ - - tostrfunc = 'to_str' - escapefunc = 'to_escaped' - - def get_expr_and_flags(self, match): - return _get_expr_and_flags(match, "#{%s}: '#{}' is not allowed with SafeTemplate.") - - -class SafePreprocessor(Preprocessor): - - tostrfunc = 'to_str' - escapefunc = 'to_escaped' - - def get_expr_and_flags(self, match): - return _get_expr_and_flags(match, "#{{%s}}: '#{{}}' is not allowed with SafePreprocessor.") - - -def _get_expr_and_flags(match, errmsg): - expr1, expr2, expr3, expr4 = match.groups() - if expr1 is not None: - raise TemplateSyntaxError(errmsg % match.group(1)) - if expr2 is not None: - return expr2, (True, False) # #{...} : call escape, not to_str - if expr3 is not None: - return expr3, (False, True) # [==...==] : not escape, call to_str - if expr4 is not None: - return expr4, (True, False) # [=...=] : call escape, not to_str - - -class SafeEngine(Engine): - - templateclass = SafeTemplate - preprocessorclass = SafePreprocessor - - -## -# for Google App Engine -# (should separate into individual file or module?) -## - -def _dummy(): - global memcache, _tenjin - memcache = _tenjin = None # lazy import of google.appengine.api.memcache - global GaeMemcacheCacheStorage, GaeMemcacheStore, init - - class GaeMemcacheCacheStorage(CacheStorage): - - lifetime = 0 # 0 means unlimited - - def __init__(self, lifetime=None, namespace=None): - CacheStorage.__init__(self) - if lifetime is not None: - self.lifetime = lifetime - self.namespace = namespace - - def _load(self, cachepath): - key = cachepath - if _tenjin.logger: - _tenjin.logger.info( - "[tenjin.gae.GaeMemcacheCacheStorage] load cache (key=%r)" % (key, )) - return memcache.get(key, namespace=self.namespace) - - def _store(self, cachepath, dct): - dct.pop('bytecode', None) - key = cachepath - if _tenjin.logger: - _tenjin.logger.info( - "[tenjin.gae.GaeMemcacheCacheStorage] store cache (key=%r)" % (key, )) - ret = memcache.set(key, dct, self.lifetime, - namespace=self.namespace) - if not ret: - if _tenjin.logger: - _tenjin.logger.info( - "[tenjin.gae.GaeMemcacheCacheStorage] failed to store cache (key=%r)" % (key, )) - - def _delete(self, cachepath): - key = cachepath - memcache.delete(key, namespace=self.namespace) - - class GaeMemcacheStore(KeyValueStore): - - lifetime = 0 - - def __init__(self, lifetime=None, namespace=None): - if lifetime is not None: - self.lifetime = lifetime - self.namespace = namespace - - def get(self, key): - return memcache.get(key, namespace=self.namespace) - - def set(self, key, value, lifetime=None): - if lifetime is None: - lifetime = self.lifetime - if memcache.set(key, value, lifetime, namespace=self.namespace): - return True - else: - if _tenjin.logger: - _tenjin.logger.info( - "[tenjin.gae.GaeMemcacheStore] failed to set (key=%r)" % (key, )) - return False - - def delete(self, key): - return memcache.delete(key, namespace=self.namespace) - - def has(self, key): - if memcache.add(key, 'dummy', namespace=self.namespace): - memcache.delete(key, namespace=self.namespace) - return False - else: - return True - - def init(): - global memcache, _tenjin - if not memcache: - from google.appengine.api import memcache - if not _tenjin: - import tenjin as _tenjin - # avoid cache confliction between versions - ver = os.environ.get('CURRENT_VERSION_ID', '1.1') # .split('.')[0] - Engine.cache = GaeMemcacheCacheStorage(namespace=ver) - # set fragment cache store - helpers.fragment_cache.store = GaeMemcacheStore(namespace=ver) - helpers.fragment_cache.lifetime = 60 # 1 minute - helpers.fragment_cache.prefix = 'fragment.' - - -gae = create_module('tenjin.gae', _dummy, - os=os, helpers=helpers, Engine=Engine, - CacheStorage=CacheStorage, KeyValueStore=KeyValueStore) - - -del _dummy diff --git a/cgi/weabot.py b/cgi/weabot.py index 720916d..636eb02 100755 --- a/cgi/weabot.py +++ b/cgi/weabot.py @@ -23,7 +23,7 @@ from formatting import * from post import * from img import * -__version__ = "0.10.0" +__version__ = "0.10.5" # Set to True to disable weabot's exception routing and enable profiling _DEBUG = False @@ -35,7 +35,7 @@ class weabot(object): def __init__(self, environ, start_response): global _DEBUG - logging.basicConfig(filename='weabot.log', format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG) + logging.basicConfig(filename='weabot.log', format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO) self.environ = environ if self.environ["PATH_INFO"].startswith("/weabot.py/"): @@ -85,7 +85,7 @@ class weabot(object): def error(self, message): board = Settings._.BOARD if board: - if board['board_type'] == '1': + if board['board_type'] == 1: info = {} info['host'] = self.environ["REMOTE_ADDR"] info['name'] = self.formdata.get('fielda', '') @@ -265,7 +265,7 @@ class weabot(object): if Settings.ENABLE_BANS and addressIsBanned(self.environ['REMOTE_ADDR'], board["dir"], blind_only=True): raise UserError('' % board["dir"]) - if len(path_split) > 4 and path_split[4] and board['board_type'] == '1': + if len(path_split) > 4 and path_split[4] and board['board_type'] == 1: self.output = dynamicRead(int(path_split[3]), path_split[4], True) elif board['board_type'] == 1: self.output = threadPage(0, True, int(path_split[3])) @@ -329,12 +329,12 @@ class weabot(object): self.output += '

    ...

    ' % url elif path_split[1] == "banned": OpenDb() - bans = FetchAll("SELECT * FROM `bans` WHERE INET6_ATON('"+self.environ["REMOTE_ADDR"]+"') BETWEEN `ipstart` AND `ipend`") + bans = FetchAll("SELECT * FROM `bans` WHERE INET6_ATON(%s) BETWEEN `ipstart` AND `ipend`", (self.environ["REMOTE_ADDR"],)) if bans: for ban in bans: if ban["boards"]: - boards = pickle.loads(ban["boards"]) - if ban["boards"] or path_split[2] in boards: + boards = str2boards(ban["boards"]) + if not ban["boards"] or path_split[2] in boards: caught = True if ban["boards"]: boards_str = '/' + '/, /'.join(boards) + '/' @@ -592,7 +592,7 @@ class weabot(object): # make ID hash if board["useid"]: post["timestamp_formatted"] += ' ID:' + iphash(ip, post, tim, board["useid"], mobile, - self.environ["HTTP_USER_AGENT"], cap_id, hide_end, (board["countrycode"] in ['1', '2'])) + self.environ["HTTP_USER_AGENT"], cap_id, hide_end, (board["countrycode"] in [1, 2])) # use for future file checks xfile = (file is not None or oek_file) @@ -793,7 +793,13 @@ class weabot(object): postid = post.insert() # delete threads that have crossed last page - trimThreads() + trimmed = trimThreads() + + # let's stop here if the thread we posted in got trimmed + if post["parentid"] and post["parentid"] in trimmed: + regenerateFrontPages() + regenerateHome() + raise UserError("El hilo en el que publicaste ya fue eliminado.") # fix null references when creating thread if board["board_type"] == 1 and not post["parentid"]: @@ -947,8 +953,8 @@ class weabot(object): raise UserError(_("You're banned.")) # check if post exists - post = FetchOne("SELECT `id`, `parentid`, `ip` FROM `posts` WHERE `id` = '%s' AND `boardid` = '%s'" % ( - _mysql.escape_string(str(postid)), _mysql.escape_string(board['id']))) + post = FetchOne("SELECT `id`, `parentid`, `ip` FROM `posts` WHERE `id` = %s AND `boardid` = %s", + (postid, board['id'])) if not post: raise UserError(_("Post doesn't exist.")) @@ -963,13 +969,12 @@ class weabot(object): # insert report t = time.time() - message = cgi.escape(self.formdata["reason"]).strip()[0:8000] + message = html.escape(self.formdata["reason"]).strip()[0:800] message = message.replace("\n", "
    ") UpdateDb("INSERT INTO `reports` (board, postid, parentid, link, ip, reason, repip, timestamp, timestamp_formatted) " + - "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', INET6_ATON('%s'), '%s', '%s')" % ( - board["dir"], post['id'], post['parentid'], link, post['ip'], _mysql.escape_string(message), - _mysql.escape_string(self.environ["REMOTE_ADDR"]), str(t), formatTimestamp(t))) + "VALUES (%s, %s, %s, %s, %s, %s, INET6_ATON(%s), %s, %s)", + (board["dir"], post['id'], post['parentid'], link, post['ip'], message, self.environ["REMOTE_ADDR"], t, formatTimestamp(t))) self.output = renderTemplate("report.html", {'finished': True}) def stats(self): -- cgit v1.2.1-18-gbd029