python随记

文档解析工具

2020-05-05  本文已影响0人  LCSan
#coding=utf-8
'''
Document-parsing helpers: JSONPath, XPath/CSS selection, regex replacement
and js2py wrappers.

Created on 2019-03-02

@author: 瞌睡蟲子
'''
import lxml.html as dom
from cssselect import GenericTranslator
import json, jsonpath, re, js2py

def initJson(data):
    """Store *data* as the module-wide default JSON document (global ``jsn``).

    ``jpath`` falls back to this document when it is called without its own
    ``data`` argument.

    Args:
        data: One of
            * an already-decoded ``dict`` or ``list`` (stored as-is),
            * a ``str`` / ``bytes`` / ``bytearray`` holding JSON text,
            * a file-like object whose ``read()`` yields JSON text.

    Raises:
        json.JSONDecodeError: if the text is not valid JSON.
    """
    global jsn
    if isinstance(data, (dict, list)):
        jsn = data
    elif isinstance(data, (str, bytes, bytearray)):
        # json.load() only accepts file-like objects; raw text needs loads().
        jsn = json.loads(data)
    else:
        jsn = json.load(data)

def jpath(exp, data=None):
    """Evaluate the JSONPath expression *exp* and return the matches.

    Args:
        exp: JSONPath expression, passed unchanged to ``jsonpath.jsonpath``.
        data: Document to query. A ``dict`` or ``list`` is used as-is; any
            other non-None value is decoded with ``json.loads``. When omitted,
            the module-wide document stored in the global ``jsn`` is used.

    Returns:
        Whatever ``jsonpath.jsonpath(document, exp)`` returns.
        NOTE(review): the jsonpath package appears to return ``False`` rather
        than an empty list when nothing matches; confirm before relying on it.

    Raises:
        NameError: if *data* is omitted and ``jsn`` has never been set.
    """
    if data is None:
        js = jsn
    elif isinstance(data, (dict, list)):
        # Already decoded; feeding it to json.loads would raise TypeError.
        js = data
    else:
        js = json.loads(data)
    return jsonpath.jsonpath(js, exp)

def initDom(html):
    """Parse *html* and store the tree as the module-wide document ``docm``.

    ``xpath`` and ``css`` query this document when called without their own
    ``html`` argument.

    Args:
        html: Markup as ``str`` (or bytes). If parsing a ``str`` fails, the
            error is printed and parsing is retried on ``html.encode()``.
            NOTE(review): the retry presumably exists because lxml rejects
            str input that carries an encoding declaration; confirm.

    Raises:
        Exception: whatever the parser raises when the retry also fails, or
            the original error when *html* is not a ``str`` and so cannot be
            retried as bytes.
    """
    global docm
    try:
        docm = dom.document_fromstring(html)
    except Exception as e:
        print("Error:" + str(e))
        if not isinstance(html, str):
            # Only str has a bytes fallback; calling .encode() on bytes would
            # raise AttributeError and hide the real parse error.
            raise
        docm = dom.document_fromstring(html.encode())

def xpath(exp, html=None):
    """Evaluate the XPath expression *exp* and return the results as strings.

    Args:
        exp: XPath expression.
        html: Optional markup to parse and query. When omitted, the
            module-wide document ``docm`` is queried. A ``str`` that fails to
            parse is retried as ``html.encode()`` after printing the error.

    Returns:
        list[str]: one entry per result. ``HtmlElement`` instances (including
        subclasses) are serialised with ``tostring``; every other item is
        converted with ``str``. An expression that yields a single scalar
        instead of a list gives a one-element list.

    Raises:
        NameError: if *html* is omitted and ``docm`` has never been set.
    """
    if html is None:
        dm = docm
    else:
        try:
            dm = dom.document_fromstring(html)
        except Exception as e:
            print("Error:" + str(e))
            if not isinstance(html, str):
                # Only str has a bytes fallback; do not mask the real error.
                raise
            dm = dom.document_fromstring(html.encode())
    found = dm.xpath(exp)
    if not isinstance(found, list):
        # A scalar result (e.g. from string() or count()) is not a list:
        # iterating a str would split it into characters, and len() fails on
        # numbers and booleans.
        return [str(found)]
    # Decide per item, and with isinstance: an exact type check on the first
    # item misses HtmlElement subclasses and breaks on mixed result lists.
    return [
        dom.tostring(item, encoding="utf-8").decode()
        if isinstance(item, dom.HtmlElement)
        else str(item)
        for item in found
    ]

def css(exp, html=None):
    """Select nodes with the CSS selector *exp*.

    The selector is translated to XPath with ``GenericTranslator`` and the
    resulting expression is handed to ``xpath`` together with *html*; the
    return value is whatever ``xpath`` returns.
    """
    translator = GenericTranslator()
    translated = translator.css_to_xpath(exp)
    return xpath(translated, html)

def replace(reg, exp, come):
    """Regex-substitute inside *come* and return the resulting string.

    The pattern is always compiled with ``re.MULTILINE | re.DOTALL``.

    Args:
        reg: Regular-expression pattern.
        exp: Replacement. One of
            * a callable taking the match object (used directly),
            * a string containing ``lambda``: it is evaluated with ``eval``
              and the result is used as the replacement,
            * any other string: used as an ordinary ``re.sub`` template.
        come: Text to operate on.

    Returns:
        str: *come* with every match replaced.
    """
    pattern = re.compile(reg, re.MULTILINE | re.DOTALL)
    if callable(exp):
        # Lets callers pass a real function instead of source text for eval.
        repl = exp
    elif 'lambda' in exp:
        # SECURITY: eval() runs arbitrary code. Only pass trusted strings.
        repl = eval(exp, globals(), locals())
    else:
        repl = exp
    return re.sub(pattern, repl, come)

def pyeval(exp, args=None):
    """Evaluate the Python expression *exp* and return its value.

    The expression sees the module globals plus two local names: ``exp``
    (the expression text itself) and ``args`` (the caller-supplied value).

    SECURITY: this is a bare ``eval``; only pass trusted expressions.
    """
    # Same mapping locals() would produce here: just the two parameters.
    scope = {"exp": exp, "args": args}
    return eval(exp, globals(), scope)

def createJsEval(param=None):
    """Create and return a ``js2py.EvalJs`` object.

    Args:
        param: Value passed to the ``EvalJs`` constructor. Defaults to a
            fresh empty dict on every call, so no dict is shared between
            calls through the default.
            NOTE(review): presumably the initial JS variable context;
            confirm against the js2py documentation.
    """
    if param is None:
        param = {}
    return js2py.EvalJs(param)

def jsExecute(evjs, js=None, use=False):
    """Forward to ``evjs.execute`` and return its result.

    Args:
        evjs: Object exposing an ``execute(js, use)`` method (presumably the
            ``js2py.EvalJs`` produced by ``createJsEval``).
        js: First positional argument for ``execute``.
        use: Second positional argument for ``execute``.
    """
    run = evjs.execute
    return run(js, use)

def jsEval(evjs, expression, use=False):
    """Forward to ``evjs.eval`` and return its result.

    Args:
        evjs: Object exposing an ``eval(expression, use)`` method (presumably
            the ``js2py.EvalJs`` produced by ``createJsEval``).
        expression: First positional argument for ``eval``.
        use: Second positional argument for ``eval``.
    """
    evaluate = evjs.eval
    return evaluate(expression, use)

def callJsFun(evjs, fun=None, args=None):
    """Call a function and return its result.

    Args:
        evjs: Either a callable itself (when *fun* is empty) or an object
            holding the function as an attribute.
        fun: Name of the attribute on *evjs* to call. When empty or None,
            *evjs* itself is called.
        args: A single argument passed to the call. When None, the call is
            made with no arguments.

    Returns:
        The value returned by the call.

    Raises:
        Exception: if *fun* is given but *evjs* has no such attribute. The
            message means "the called method does not exist".
    """
    if not fun:
        target = evjs
    elif hasattr(evjs, fun):
        target = getattr(evjs, fun)
    else:
        raise Exception("调用方法不存在")
    if args is None:
        # Call with no arguments in both branches; returning the attribute
        # uncalled made zero-argument named functions impossible to invoke.
        return target()
    return target(args)

def callJsProperty(evjs, attr=None):
    """Return a property of *evjs*, or *evjs* itself when no name is given.

    Args:
        evjs: Object to read from.
        attr: Text appended after ``"evjs."`` and evaluated, so dotted paths
            such as ``"a.b"`` work. When None, *evjs* is returned unchanged.

    Returns:
        The value of the evaluated ``evjs.<attr>`` expression, or *evjs*.

    SECURITY: *attr* goes through ``eval``; only pass trusted strings.
    """
    # Test the argument, not the builtin `property`, which is never None.
    if attr is None:
        return evjs
    return eval("evjs." + attr, globals(), locals())
if __name__ == "__main__":
    # Manual smoke test against a local sample file.
    # The context manager closes the handle even if reading fails.
    with open("C:\\Users\\Administrator\\Desktop\\text.xml", encoding='UTF-8') as f:   # open the file
        fr = f.read()                                       # read the whole file
    initDom(fr)
    a = xpath("//poi",fr)
    print(a)
    for i in a:
        print(i)
        # Each serialised <poi> fragment is re-parsed and queried on its own.
        j = xpath("//name",i)
        print(j)
        print(xpath("//name//text()",j[0]))
    # No html argument: queries the document stored by initDom above.
    print(xpath("//pname/text()"))
#     html = '''<ul>
#           
#               <li class="menuItemList">
#                   <a id="CatList_LinkList_0_Link_0" href="https://www.cnblogs.com/hiwuchong/category/1132709.html">Python编程(43)</a>
#                   <span style="width:10px">
#                       啊啊啊啊
#                   </span> 
#               </li>
#           
#               <li class="menuItemList">
#                   <a id="CatList_LinkList_0_Link_1" href="https://www.cnblogs.com/hiwuchong/category/1003142.html">R语言实战(3)</a>
#                   <span style="width:10px">
#                       大幅度发
#                   </span> 
#               </li>
#           
#               <li class="menuItemList">
#                   <a id="CatList_LinkList_0_Link_2" href="https://www.cnblogs.com/hiwuchong/category/1003145.html">SQL SERVER(4)</a>
#                   <img src="aaaaaaaaaa"></img>
#                   <span style="width:10px">
#                       大多数
#                   </span> 
#               </li>
#               
#               </ul>
#     '''
#     # initDom(html)
# #     print(xpath("//li/a/text()",html))
# #     initDom(html)
# #     print(css("li a"))
# #     initJson({"a":"123","b":123,"c":"啊哈","d":{"e":"aa"},"f":[{"a":"ad"},{"g":"df","m":"dfd"}]})
# #     print(jpath("$..f","{\"a\":1,\"f\":2}"))
# #     print(jpath("$..f[*]"))
# #     print(pyeval("[item+'aa' for item in args]", ["a","b","c"]))
# 
# #     url = 'https://113.215.20.136:9011/113.215.6.77/c3pr90ntcya0/youku/6981496DC9913B8321BFE4A4E73/0300010E0C51F10D86F80703BAF2B1ADC67C80-E0F6-4FF8-B570-7DC5603F9F40.flv'
#     print(replace(r'<(?!img).*?>', "", html))
# #     print(replace(r'<img\s+src()>', "", html))

上一篇 下一篇

猜你喜欢

热点阅读