You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

380 lines
15 KiB

# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""
__author__ = "Stéphane Bidoul <sbi@skynet.be>"
__version__ = "0.3"
import sys
import codecs
if sys.version_info[0] < 3:
__author__ = codecs.unicode_escape_decode(__author__)[0]
StringTypes = (str, unicode)
# libxml2 returns strings as UTF8
_decoder = codecs.lookup("utf8")[1]
def _d(s):
if s is None:
return s
else:
return _decoder(s)[0]
else:
StringTypes = str
# s is Unicode `str` already
def _d(s):
return s
from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import \
feature_namespaces, \
feature_namespace_prefixes, \
feature_string_interning, \
feature_validation, \
feature_external_ges, \
feature_external_pes, \
property_lexical_handler, \
property_declaration_handler, \
property_dom_node, \
property_xml_string
try:
import libxml2
except ImportError:
raise SAXReaderNotAvailable("libxml2 not available: " \
"import error was: %s" % sys.exc_info()[1])
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API.

    Only the line number and the system identifier are taken from the
    wrapped object; column and public identifier are reported as unknown.
    """

    def __init__(self, locator):
        # the wrapped libxml2 locator, queried on demand by the getters
        self.__locator = locator

    def getColumnNumber(self):
        """Return the column where the current event ends (always -1)."""
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the base URI reported by the wrapped locator.
        """
        wrapped = self.__locator
        return wrapped.BaseURI()
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives a libxml2 xmlTextReader node by node and translates each
    node into calls on the registered SAX content, error and (optional)
    lexical handlers.

    Feature defaults: namespaces off, namespace-prefixes off, validation
    off, external parameter entities on.  DTDHandler, EntityResolver and the
    declaration-handler property are not supported and raise
    SAXNotSupportedException when set.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples waiting to be reported
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one diagnostic emitted by the libxml2 reader.

        Registered through reader.SetErrorHandler() in parse().  The
        diagnostic is only stored here; _reportErrors() forwards it to the
        SAX error handler after the current Read() call has returned.

        arg is the user data given at registration (None) and is unused.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued diagnostics to the SAX error handler.

        Warnings go to warning().  Other diagnostics go to error(), except
        that when `fatal` is true the last queued one goes to fatalError():
        the parse is about to stop, and the last error reported is taken
        to be the fatal one.

        Must only be called when diagnostics are queued.
        """
        # Detach the queue before calling out: an error handler is allowed
        # to raise, and stale entries must not be replayed by a later parse.
        pending, self.__errors = self.__errors, None
        warning_levels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                          libxml2.PARSER_SEVERITY_WARNING)
        last = pending[-1][1]
        for severity, exception in pending:
            if severity in warning_levels:
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _configureReader(self, reader):
        """Install the error callback and apply the feature settings."""
        reader.SetErrorHandler(self._errorHandler, None)
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _processEvents(self, reader):
        """Run the read loop, turning reader nodes into SAX callbacks.

        Emits startDocument() first; endDocument() is emitted only when
        the reader reached the end of input without a fatal error.
        """
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
        else:
            attributesImpl = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping): one list per open element
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while True:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                break  # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                break  # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    eltName = (_d(reader.NamespaceUri()),
                               _d(reader.LocalName()))
                    eltQName = _d(reader.Name())
                    attributesNSImpl._attrs = attrs = {}
                    attributesNSImpl._qnames = qnames = {}
                    newPrefixes = []
                    while reader.MoveToNextAttribute():
                        qname = _d(reader.Name())
                        value = _d(reader.Value())
                        # Only "xmlns" and "xmlns:<prefix>" declare a
                        # namespace; an ordinary attribute whose name just
                        # happens to begin with "xmlns" must be reported
                        # as a regular attribute.
                        if qname == "xmlns" or qname.startswith("xmlns:"):
                            if len(qname) > 5:
                                newPrefix = qname[6:]
                            else:
                                newPrefix = None
                            newPrefixes.append(newPrefix)
                            self._cont_handler.startPrefixMapping(
                                newPrefix, value)
                            if not self.__nspfx:
                                continue  # don't report xmlns attribute
                        attName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        qnames[attName] = qname
                        attrs[attName] = value
                    reader.MoveToElement()
                    self._cont_handler.startElementNS(
                        eltName, eltQName, attributesNSImpl)
                    if reader.IsEmptyElement():
                        # no EndElement node follows: close it right away
                        self._cont_handler.endElementNS(eltName, eltQName)
                        for newPrefix in newPrefixes:
                            self._cont_handler.endPrefixMapping(newPrefix)
                    else:
                        prefixes.append(newPrefixes)
                else:
                    eltName = _d(reader.Name())
                    attributesImpl._attrs = attrs = {}
                    while reader.MoveToNextAttribute():
                        attName = _d(reader.Name())
                        attrs[attName] = _d(reader.Value())
                    reader.MoveToElement()
                    self._cont_handler.startElement(eltName, attributesImpl)
                    if reader.IsEmptyElement():
                        self._cont_handler.endElement(eltName)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text
            elif nodeType == 3:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # SignificantWhitespace
            elif nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                # entity boundaries are lexical events: they belong to the
                # lexical handler, not to the reader itself
                if self.__lex_handler is not None:
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): confirm that the libxml2 binding in use
                # actually exposes ResolveEntity on the text reader
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                # if self.__lex_handler is not None:
                #     self.__lex_handler.startDTD()
                pass  # TODO (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass  # TODO
            # Entity
            elif nodeType == 6:
                pass  # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass  # TODO
            # Not handled on purpose:
            #   2  Attribute (never in this loop)
            #   9  Document (not exposed)
            #   11 DocumentFragment (never returned by XmlReader)
            #   0  None
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()

    def parse(self, source):
        """Parse an XML document and report it through the SAX handlers.

        source is either a string, handed to libxml2 as a file name, or
        anything accepted by saxutils.prepare_input_source(), in which case
        its byte stream and system id are used.

        Raises SAXException on an unexpected reader node type; anything
        raised by a handler propagates unchanged.  The reader is closed and
        the parsing flag cleared however the parse ends.
        """
        self.__parsing = 1
        # forget diagnostics left behind by an earlier parse that was aborted
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                in_buf = libxml2.inputBuffer(source.getByteStream())
                reader = in_buf.newTextReader(source.getSystemId())
            try:
                self._configureReader(reader)
                self._processEvents(reader)
            finally:
                # release the reader even when a handler raised
                reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the SAX feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the SAX property `name`.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the SAX property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException, any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
def create_parser():
    """Return a fresh LibXml2Reader.

    This is the factory that xml.sax.make_parser(["drv_libxml2"]) relies on
    to obtain the driver's reader (see USAGE in the module docstring).
    """
    parser = LibXml2Reader()
    return parser