0001"""Infoset serialization formats (XML, XHTML, HTML, etc)"""
0002
0003from __future__ import generators
0004
# Module metadata. The "$Rev$" / "$Date: ...$" values look like
# version-control keyword expansions -- NOTE(review): confirm they are still
# maintained by the repository before relying on them.
__revision__ = "$Rev$"
__date__ = "$Date: 2005-02-16 15:43:38 -0500 (Wed, 16 Feb 2005) $"
__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011import re
0012
0013from kid.et import *
0014from kid.pull import *
0015from kid.pull import _coalesce
0016
0017
0018import kid.namespace as namespace
0019
# Names exported by ``from <this module> import *``.
# NOTE(review): XHTMLSerializer and PlainSerializer are defined in this module
# but are not listed here -- confirm whether that omission is intentional.
__all__ = ['doctypes', 'Serializer', 'XMLSerializer', 'HTMLSerializer']
0021
0022
0023
# Known doctypes, keyed by a short name.  Each value is the triple that
# serialize_doctype() interpolates into a declaration:
#     (root element name, public identifier, system identifier)
doctypes = {
    'html-strict': (
        'HTML',
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd',
    ),
    'html': (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd',
    ),
    'xhtml-strict': (
        'html',
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
    ),
    'xhtml': (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
    ),
}
0033
0034
class Serializer(object):
    """Base class for serializers that turn an event stream into text.

    ``serialize`` and ``write`` are both built on ``generate``, which this
    base class leaves empty; concrete serializers override it.
    """

    # Prefix/URI table; serializers pass it to NamespaceStack.
    namespaces = namespace.namespaces
    # Output encoding used when none is given per call.
    encoding = 'utf-8'
    # When true (and strip_whitespace is false), apply balancing_filter.
    balanced_blocks = 1
    # When true, strip every TEXT event instead of balancing.
    strip_whitespace = 0

    def __init__(self, encoding=None, src_encoding="utf-8"):
        """Store the output encoding (if given) and the source encoding.

        ``src_encoding`` is handed to ``_coalesce`` by ``apply_filters``.
        """
        if encoding is not None:
            self.encoding = encoding
        self.src_encoding = src_encoding

    def has_only_pcdata(self, tagname):
        """Return whether ``tagname`` is a text-only element.

        The base implementation always answers False; balancing_filter uses
        the answer to decide whether held-over whitespace may be reused.
        """
        return False

    def serialize(self, stream, encoding=None, fragment=0):
        """Return the whole serialization of ``stream`` as one string."""
        # Materialize first, then join, exactly as the chunks were produced.
        text = list(self.generate(stream, encoding, fragment))
        return ''.join(text)

    def write(self, stream, file, encoding=None, fragment=0):
        """Write the serialization of ``stream`` to ``file``.

        ``file`` is either an object with a ``write`` method, which is used
        as-is and left open, or a path, which is opened in binary write mode
        and closed again before returning -- also when serialization raises.
        """
        opened_here = not hasattr(file, 'write')
        if opened_here:
            file = open(file, 'wb')
        try:
            write = file.write
            for text in self.generate(stream, encoding, fragment):
                write(text)
        finally:
            # Only close handles this method created; the caller owns the rest.
            if opened_here:
                file.close()

    def generate(self, stream, encoding=None, fragment=0):
        """Yield serialized chunks for ``stream``; overridden by subclasses.

        NOTE(review): this base implementation returns None, so calling
        ``serialize`` or ``write`` on a bare Serializer fails when the result
        is iterated.
        """
        pass

    def apply_filters(self, stream):
        """Return ``stream`` wrapped in the configured filters.

        The stream always goes through ``_coalesce`` first.  After that,
        ``strip_whitespace`` takes precedence over ``balanced_blocks``; at
        most one of the two whitespace filters is applied.
        """
        stream = _coalesce(stream, self.src_encoding)
        if self.strip_whitespace:
            stream = self.whitespace_filter(stream)
        else:
            if self.balanced_blocks:
                stream = self.balancing_filter(stream)
        return stream

    def balancing_filter(self, stream):
        """Filter TEXT events so whitespace around tags stays balanced.

        TEXT events are held back rather than passed straight through; the
        most recent one is (re-)emitted in front of the following non-Fragment
        START/END events.  ``hops`` counts how many such events have been
        emitted since the last TEXT event.

        NOTE(review): text that follows the last non-Fragment START/END event
        is held but never emitted, since nothing flushes it at end of stream.
        """
        line_collapse = re.compile('\n{2,}')
        text = ''
        hops = 0
        for ev, item in stream:
            if ev == TEXT:
                # Hold the text; it is emitted in front of the next tag.
                text = item
                hops = 0
            elif ev in (START, END) and item.tag != Fragment:
                if hops > 0:
                    # The held text was already emitted once.  Reuse it only
                    # when it is pure whitespace, with newline runs collapsed.
                    if text and text.strip() == '':
                        yield (TEXT, line_collapse.sub('\n', text))
                elif text:
                    # First tag after the text: emit it, collapsing newline
                    # runs when it is whitespace only.
                    if text.strip() == '':
                        yield (TEXT, line_collapse.sub('\n', text))
                    else:
                        yield (TEXT, text)
                yield (ev, item)
                hops += 1
                # Do not reuse the held whitespace inside an element that may
                # only contain character data.
                if ev == START and self.has_only_pcdata(item.tag):
                    text = ''
            else:
                # Fragment markers and any other event pass through untouched.
                yield (ev, item)

    def whitespace_filter(self, stream):
        """Strip leading/trailing whitespace from every TEXT event."""
        for ev, item in stream:
            if ev == TEXT:
                yield (TEXT, item.strip())
            else:
                yield (ev, item)
0105
class XMLSerializer(Serializer):
    """Serializer that writes an event stream as XML markup."""

    # Emit the ``<?xml ...?>`` declaration when true.
    decl = 1
    # Doctype triple, or a key of ``doctypes``, or None for no doctype.
    doctype = None
    cdata_elements = []

    def __init__(self, encoding=None, decl=None, doctype=None,
                 namespaces=None):
        """Configure the serializer; None/empty arguments keep class defaults.

        A string doctype (given here or set on the class) is looked up in
        ``doctypes``; an unknown name raises KeyError.
        """
        Serializer.__init__(self, encoding)
        if decl is not None:
            self.decl = decl
        if doctype is not None:
            self.doctype = doctype
        if isinstance(self.doctype, (str, unicode)):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if namespaces:
            self.namespaces = namespaces

    def can_be_empty_element(self, ns_stack, item_name):
        """Return whether the element may be written as ``<tag />``.

        Plain XML allows it for every element.
        """
        return True

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        ``encoding`` falls back to ``self.encoding`` and then ``'utf-8'``.
        Unless ``fragment`` is true, the XML declaration (when ``self.decl``
        is set) and the doctype (when ``self.doctype`` is not None) are
        emitted first.

        """
        encoding = encoding or self.encoding or 'utf-8'
        escape_cdata = XMLSerializer.escape_cdata
        escape_attrib = XMLSerializer.escape_attrib

        lastev = None
        stream = iter(stream)
        names = NamespaceStack(self.namespaces)
        if not fragment:
            if self.decl:
                yield '<?xml version="1.0" encoding="%s"?>\n' % encoding
            if self.doctype is not None:
                yield serialize_doctype(self.doctype) + '\n'
        # Text is buffered until the next markup event so an element with no
        # (or whitespace-only) content can still be written as ``<tag />``.
        text = None
        for ev, item in self.apply_filters(stream):
            if ev in (START, END) and item.tag == Fragment:
                continue
            elif ev == TEXT:
                if text is not None:
                    text = u''.join([text, item])
                else:
                    text = item
                continue
            if lastev == START:
                # The previous start tag is still open: either collapse it
                # into an empty element or close it with ">".
                if (ev == END and (not text or not text.strip())
                        and self.can_be_empty_element(names, item.tag)):
                    yield ' />'
                    lastev = END
                    text = None
                    names.pop()
                    continue
                yield ">"
            if text:
                yield escape_cdata(text, encoding)
                text = None
            if ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    lastev = COMMENT
                    continue
                elif item.tag == ProcessingInstruction:
                    yield "<?%s?>" % item.text.encode(encoding)
                    lastev = PI
                    continue
                else:
                    tag = item.tag
                    names.push(namespaces(item, remove=1))
                    qname = names.qname(tag, default=1)
                    yield "<" + qname.encode(encoding)
                    attrs = item.attrib.items()
                    if attrs:
                        for k, v in attrs:
                            qname = names.qname(k, default=0)
                            yield ' %s="%s"' % (qname.encode(encoding),
                                                escape_attrib(v, encoding))
                    # Declare every namespace bound at this element's level.
                    for prefix, uri in names.current.items():
                        if prefix == '':
                            yield ' xmlns="%s"' % escape_attrib(uri, encoding)
                        else:
                            yield ' xmlns:%s="%s"' % (
                                prefix.encode(encoding),
                                escape_attrib(uri, encoding))
            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction):
                qname = names.qname(item.tag, default=1)
                yield "</%s>" % qname.encode(encoding)
                names.pop()
            lastev = ev
        # Text at the very end of a fragment has no following markup event to
        # trigger the flush above, so emit it here instead of dropping it.
        if fragment and text:
            yield escape_cdata(text, encoding)

    def escape_cdata(text, encoding=None):
        """Escape character data.

        ``&`` and ``<`` are replaced by ``&amp;`` and ``&lt;``.  When
        ``encoding`` is given the text is encoded first; if that fails the
        result of ``_encode_entity`` is returned instead.  Values without the
        string methods used here are reported through
        ``_raise_serialization_error``.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            # "&" must be escaped first so the entities added next survive.
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        Like ``escape_cdata`` but additionally replaces the double quote with
        ``&quot;``, since attribute values are written inside double quotes.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0232
0233
0234
# Compatibility shim: make sure a callable named ``set`` exists.
try:
    set
except NameError:
    # No builtin ``set``: fall back to the ``sets`` module.
    try:
        from sets import Set as set
    except ImportError:
        # No ``sets`` module either: hand the sequence back unchanged, so the
        # "sets" built with it are plain sequences with the same members.
        def set(seq):
            return seq

import kid.namespace as namespace
# XHTML namespace URI; HTMLSerializer.generate compares tag URIs against it.
xhtml = namespace.xhtml.uri
import string
0247
class HTMLSerializer(Serializer):
    """Serializer that writes an event stream as HTML markup."""

    doctype = doctypes['html']
    # Function applied to every tag/attribute name before output; None
    # disables it (see grok_name in generate).
    transpose = string.upper
    transpose = staticmethod(transpose)
    # When true, a Content-Type <meta> is injected right after <head>.
    inject_type = 1
    # Elements written without an end tag.
    empty_elements = set(['area', 'base', 'basefont', 'br', 'col', 'frame',
                          'hr', 'img', 'input', 'isindex', 'link', 'meta',
                          'param'])

    elements_with_pcdata = set(['option', 'textarea', 'fieldset', 'title'])
    # Elements whose text content is written without escaping.
    noescape_elements = set(['script', 'style'])
    # Attributes written by name only, without a value.
    boolean_attributes = set(['selected', 'checked', 'compact', 'declare',
                              'defer', 'disabled', 'ismap', 'multiple',
                              'nohref', 'noresize', 'noshade', 'nowrap'])

    def __init__(self, encoding='utf-8', doctype=None, transpose=None):
        """Configure the serializer; falsy arguments keep the class defaults.

        A string doctype (given here or set on the class) is looked up in
        ``doctypes``; an unknown name raises KeyError.
        """
        Serializer.__init__(self, encoding)
        if doctype:
            self.doctype = doctype
        if isinstance(self.doctype, (str, unicode)):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if transpose:
            self.transpose = transpose

    def has_only_pcdata(self, tagname):
        """Return whether ``tagname`` is listed in ``elements_with_pcdata``.

        A leading ``{uri}`` part is dropped from string tag names first.
        """
        if isinstance(tagname, (str, unicode)) and tagname[0] == '{':
            tagname = tagname.split('}')[1]
        return tagname in self.elements_with_pcdata

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        ``encoding`` falls back to ``self.encoding`` and then ``'utf-8'``.
        The doctype is emitted first unless ``fragment`` is true or
        ``self.doctype`` is None.

        """
        encoding = encoding or self.encoding or 'utf-8'

        escape_cdata = HTMLSerializer.escape_cdata
        escape_attrib = HTMLSerializer.escape_attrib
        noescape_elements = self.noescape_elements
        boolean_attributes = self.boolean_attributes
        empty_elements = self.empty_elements

        names = NamespaceStack(self.namespaces)

        def grok_name(tag):
            """Split ``tag`` into (uri, localname, output name)."""
            if tag[0] == '{':
                uri, localname = tag[1:].split('}', 1)
            else:
                uri, localname = None, tag
            # Only names outside the XHTML namespace keep a prefix.
            if uri and uri != xhtml:
                qname = names.qname(tag, default=0)
            else:
                qname = localname
            if self.transpose is not None:
                qname = self.transpose(qname)
            return (uri, localname, qname)

        # Attribute names for the injected <meta>.  Fall back to the plain
        # spelling when no transpose function is set, so the names are always
        # bound when the <head> branch below needs them.
        if self.transpose:
            attr_http_equiv = self.transpose('http-equiv')
            attr_content = self.transpose('content')
        else:
            attr_http_equiv = 'http-equiv'
            attr_content = 'content'

        # ``current`` is the lower-cased name of the innermost open element;
        # the None sentinel at the bottom of the stack stands for "no element".
        current = None
        stack = [current]
        stream = iter(stream)
        if not fragment and self.doctype is not None:
            yield serialize_doctype(self.doctype) + '\n'
        for ev, item in self.apply_filters(stream):
            if ev == TEXT and item:
                escape = current not in noescape_elements
                yield escape_cdata(item, encoding, escape)
            elif ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    continue
                elif item.tag == ProcessingInstruction:
                    yield "<?%s>" % item.text.encode(encoding)
                    continue
                elif item.tag == Fragment:
                    continue
                else:
                    names.push(namespaces(item, remove=1))
                    tag = item.tag
                    (uri, localname, qname) = grok_name(tag)

                    # Element sets above are lower case; compare in that form.
                    current = qname.lower()
                    stack.append(current)

                    yield "<" + qname.encode(encoding)
                    attrs = item.attrib.items()
                    if attrs:
                        for k, v in attrs:
                            (u, l, q) = grok_name(k)
                            lq = q.lower()
                            if lq == 'xml:lang':
                                continue
                            if lq in boolean_attributes:
                                # Boolean attributes are written name-only.
                                yield ' %s' % q.encode(encoding)
                            else:
                                yield ' %s="%s"' % (q.encode(encoding),
                                                    escape_attrib(v, encoding))
                    yield ">"
                    if self.inject_type:
                        if current == 'head':
                            (uri, localname, qname) = grok_name("meta")
                            meta = ('<%s %s="text/html; charset=%s"'
                                    ' %s="Content-Type">')
                            yield meta % (qname.encode(encoding),
                                          attr_content,
                                          encoding,
                                          attr_http_equiv)

            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction,
                                                Fragment):
                current = stack.pop()
                if current not in empty_elements:
                    tag = item.tag
                    (uri, localname, qname) = grok_name(tag)
                    yield "</%s>" % qname.encode(encoding)
                current = stack[-1]
                names.pop()

    def escape_cdata(text, encoding=None, escape=1):
        """Escape character data.

        When ``escape`` is true, ``&`` and ``<`` are replaced by ``&amp;`` and
        ``&lt;``.  When ``encoding`` is given the text is encoded first; if
        that fails the result of ``_encode_entity`` is returned, regardless of
        ``escape``.  Values without the string methods used here are reported
        through ``_raise_serialization_error``.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            if escape:
                # "&" must be escaped first so the entity added next survives.
                text = text.replace("&", "&amp;")
                text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        ``&`` and the double quote are replaced by ``&amp;`` and ``&quot;``;
        encoding and error handling follow ``escape_cdata``.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0410
class XHTMLSerializer(XMLSerializer):
    """XML serializer that applies HTML's element rules to XHTML names.

    Both element lists are HTMLSerializer's lists converted with
    ``namespace.xhtml.clarkname``.
    """

    empty_elements = [
        namespace.xhtml.clarkname(name)
        for name in HTMLSerializer.empty_elements
    ]
    elements_with_pcdata = [
        namespace.xhtml.clarkname(name)
        for name in HTMLSerializer.elements_with_pcdata
    ]

    def can_be_empty_element(self, ns_stack, tagname):
        """Only elements in ``empty_elements`` may be written as ``<tag />``."""
        collapsible = self.empty_elements
        return tagname in collapsible

    def has_only_pcdata(self, tagname):
        """Return whether ``tagname`` is in ``elements_with_pcdata``."""
        text_only = self.elements_with_pcdata
        return tagname in text_only
0420
class PlainSerializer(Serializer):
    """Serializer that emits only the text content of the stream."""

    def generate(self, stream, encoding=None, fragment=0):
        """Yield the payload of every TEXT event; other events are skipped."""
        # NOTE(review): the encoding is resolved here but never applied, so
        # the text is yielded exactly as the filters delivered it.
        encoding = encoding or self.encoding or 'utf-8'
        filtered = self.apply_filters(stream)
        for kind, payload in filtered:
            if kind != TEXT:
                continue
            yield payload
0429
0430
class NamespaceStack:

    """Maintains a stack of namespace prefix to URI mappings."""

    def __init__(self, default_map=namespace.namespaces):
        """Start with one empty mapping level.

        ``default_map`` is consulted by ``qname`` (via ``.get(uri)``) for a
        preferred prefix when a URI is not bound anywhere on the stack.
        """
        self.stack = []
        self.default_map = default_map
        self.push()
        # Counter behind the generated ``ns1``, ``ns2``, ... prefixes.
        self.ns_count = 0

    def push(self, names=None):
        """Push a prefix -> URI dict (a new empty one by default).

        The innermost level lives at index 0 of ``self.stack`` and is also
        available as ``self.current``.
        """
        if names is None:
            names = {}
        self.current = names
        self.stack.insert(0, self.current)

    def pop(self):
        """Drop the innermost level and make the next one current.

        When the last level is removed, ``self.current`` keeps referring to
        the mapping that was just dropped.
        """
        del self.stack[0]
        if len(self.stack):
            self.current = self.stack[0]

    def resolve_prefix(self, uri, default=1):
        """Figure out prefix given a URI.

        Returns ``'xml'`` for the XML namespace, ``''`` when ``default`` is
        true and ``uri`` is the current default namespace, a bound non-empty
        prefix if one is found, and None otherwise.
        """

        if uri == 'http://www.w3.org/XML/1998/namespace':
            return 'xml'
        # is_default is -1 until a default namespace declaration has been
        # seen; after that it records whether that declaration matches uri.
        is_default = -1
        prefix = None
        for names in self.stack:
            for k, v in names.items():
                if default and is_default == -1 and k == '':
                    # This is the innermost default namespace declaration.
                    is_default = (v == uri)
                    if (default and is_default) or prefix:
                        break
                if v == uri and k != '':
                    prefix = k
                    if is_default > -1:
                        break
        if default and is_default == 1:
            return ''
        elif prefix:
            return prefix
        else:
            return None

    def resolve_uri(self, prefix):
        """Figure out URI given a prefix.

        Levels are searched innermost first; returns None when no level binds
        ``prefix`` to a non-empty URI.
        """

        if prefix == 'xml':
            return 'http://www.w3.org/XML/1998/namespace'
        for names in self.stack:
            uri = names.get(prefix)
            if uri:
                return uri
        return None

    def qname(self, cname, default=0):
        """Return the ``prefix:local`` (or bare ``local``) form of ``cname``.

        ``cname`` is a ``{uri}local`` string or a QName; names that do not
        start with ``{`` are returned unchanged.  When the URI has no prefix
        yet, one is bound on the current level: the prefix from
        ``default_map`` if there is one, else the default namespace when
        ``default`` is true and the current level has not declared one, else
        a generated ``nsN`` prefix.
        """
        if isinstance(cname, QName):
            cname = cname.text
        if cname[0] != '{':
            # Not namespace-qualified: nothing to resolve.
            return cname
        uri, localname = cname[1:].split('}', 1)
        prefix = self.resolve_prefix(uri, default)
        if prefix is None:
            # Unbound so far: see whether the default map suggests a prefix.
            prefix = self.default_map.get(uri)
            if prefix is not None:
                self.current[prefix] = uri
            else:
                if default and '' not in self.current:
                    prefix = ''
                    self.current[prefix] = uri
                else:
                    self.ns_count += 1
                    # NOTE(review): generated prefixes are not checked
                    # against prefixes that are already bound.
                    prefix = 'ns%d' % self.ns_count
                    self.current[prefix] = uri
        if prefix != '':
            return '%s:%s' % (prefix, localname)
        else:
            return localname

    def set(self, prefix, uri):
        """Bind ``prefix`` to ``uri`` on the current level.

        A None prefix binds the default namespace (``''``).
        """
        if prefix is None:
            prefix = ''
        self.current[prefix] = uri
0520
0521
0522
0523
# Private helpers borrowed from the ET object exported by kid.et.  The
# escape_* methods in this module call _encode_entity when encoding text
# raises UnicodeError, and _raise_serialization_error when the value raises
# TypeError/AttributeError.
from kid.et import ET
_encode_entity = ET._encode_entity
_raise_serialization_error = ET._raise_serialization_error
0527
def serialize_doctype(doctype):
    """Render a doctype triple as a ``<!DOCTYPE ...>`` declaration.

    ``doctype`` is a 3-tuple interpolated, in order, as the root name, the
    public identifier and the system identifier.
    """
    declaration = '<!DOCTYPE %s PUBLIC "%s" "%s">'
    return declaration % doctype