# -*- coding: cp949 -*-
import re, xml.dom.minidom, HTMLParser
import codeshift

class parse_base_v3(object):
    """Base class that parses an XML text into a minidom document.

    The constructor normalizes the whitespace of the given text and parses
    it with xml.dom.minidom.  The parsed tree is exposed through the
    ``document`` property and a result dictionary through the read-only
    ``result`` property.
    """

    # Class-level fallbacks, used only when an instance was created without
    # running __init__ of this class.
    __document = None
    __result   = {}

    def __init__(self, parseText):
        """Normalize ``parseText`` and parse it into ``self.document``.

        parseText: XML text to parse.  It is passed through
                   _make_neat_xml() before being handed to
                   xml.dom.minidom.parseString(), so any exception raised
                   by the parser propagates to the caller.
        """
        # Give every instance its own dict; otherwise all instances would
        # mutate the single dict stored on the class.
        self.__result = {}

        neat_text     = self._make_neat_xml(parseText)
        self.document = xml.dom.minidom.parseString(neat_text)

    # get text data.
    # item: Parent node of element_name. This node will be the top node to be searched.
    #       Only the first element_name element within item node will be chosen.
    #       If item node is None, this function returns an empty string.
    #
    # element_name: Target element that contains a text node.
    #               Only its first child is examined; that child must be a
    #               text node or a CDATA section.
    #               If the element is missing, has no children, or its first
    #               child is of another type, this function returns an empty string.
    #
    # returns: The stripped text of the first child with HTML entities decoded,
    #          or u'' in the cases described above.
    def _get_text_data(self, item, element_name):
        if item is None:
            return u''

        # getElementsByTagName() returns a (possibly empty) NodeList, never
        # None, so emptiness has to be tested before indexing.
        matches = item.getElementsByTagName(element_name)
        if not matches:
            return u''

        first_child = matches[0].firstChild
        if first_child is None:
            return u''

        text_node_types = (xml.dom.Node.CDATA_SECTION_NODE,
                           xml.dom.Node.TEXT_NODE)
        if first_child.nodeType not in text_node_types:
            return u''

        # HTMLParser is used to decode HTML entities in the text.
        # It is created only when there is actually something to decode.
        unescape = HTMLParser.HTMLParser().unescape
        return unescape(first_child.data.strip())

    # remove all whitespaces between two adjacent tags.
    # Every whitespace character in the text (not only those between tags) is
    # first replaced by a single space; then any run of spaces located between
    # '>' and '<' is dropped.
    # NOTE(review): codeshift.uni2utf8 is defined outside this file; presumably
    # it converts a unicode string to UTF-8 bytes - confirm.
    def _make_neat_xml(self, text):
        single_space = codeshift.uni2utf8(u' ')
        joined_tags  = codeshift.uni2utf8(u'><')

        flattened = re.sub(r'\s', single_space, text)
        return re.sub(r'> +<', joined_tags, flattened)

    @property
    def document(self):
        """The parsed minidom document (None until one has been assigned)."""
        return self.__document

    @document.setter
    def document(self, doc):
        self.__document = doc

    @property
    def result(self):
        """The result dictionary of this parser (read-only property)."""
        return self.__result