# -*- coding: cp949 -*-
import re
import xml.dom.minidom
import HTMLParser

import codeshift


class parse_base_v3(object):
    """Base class that parses an XML string into a minidom document.

    The parsed document is exposed through the ``document`` property and a
    result dictionary through the read-only ``result`` property.
    """

    # Class-level fallbacks, used only when an instance never ran this
    # class's __init__ (e.g. a subclass that overrides it without calling it).
    __document = None
    __result = {}

    def __init__(self, parseText):
        """Normalize ``parseText`` and parse it with xml.dom.minidom.

        parseText: XML source text handed to ``_make_neat_xml`` and then to
            ``xml.dom.minidom.parseString``.

        Any exception raised by ``parseString`` propagates to the caller.
        """
        # Give every instance its own dict; the class-level dict would
        # otherwise be shared (and mutated) across all instances.
        self.__result = {}
        neat_text = self._make_neat_xml(parseText)
        self.document = xml.dom.minidom.parseString(neat_text)

    def _get_text_data(self, item, element_name):
        """Return the unescaped, stripped text of an element below ``item``.

        item: Node to search from. If it is None, an empty string is
            returned.
        element_name: Tag name to look up. Only the first matching element
            found by ``item.getElementsByTagName`` is used.

        Only the first child of that element is inspected. When it is a text
        or CDATA node, its data is stripped and HTML entities are unescaped.
        In every other case (no matching element, no child, a first child of
        another node type) an empty unicode string is returned.
        """
        if item is None:
            return u''

        # getElementsByTagName returns an empty list, not None, when nothing
        # matches, so test for emptiness before indexing.
        matches = item.getElementsByTagName(element_name)
        if not matches:
            return u''

        first_child = matches[0].firstChild
        if first_child is None:
            return u''

        text_node_types = (
            xml.dom.Node.CDATA_SECTION_NODE,
            xml.dom.Node.TEXT_NODE,
        )
        if first_child.nodeType not in text_node_types:
            return u''

        # HTMLParser is used only to resolve HTML entities in the text.
        unescape = HTMLParser.HTMLParser().unescape
        return unescape(first_child.data.strip())

    def _make_neat_xml(self, text):
        """Remove the whitespace between two adjacent tags.

        Every whitespace character is first replaced by a single space, then
        each run of spaces between ``>`` and ``<`` is dropped. Whitespace
        inside text content is therefore turned into spaces as well.
        """
        single_spaced = re.sub(r'\s', codeshift.uni2utf8(u' '), text)
        return re.sub(r'> +<', codeshift.uni2utf8(u'><'), single_spaced)

    @property
    def document(self):
        """The parsed minidom document (None until one has been assigned)."""
        return self.__document

    @document.setter
    def document(self, doc):
        self.__document = doc

    @property
    def result(self):
        """Dictionary holding parse results; read-only as a property."""
        return self.__result