文档解析工具
2020-05-05 本文已影响0人
LCSan
#coding=utf-8
'''
Created on 2019年3月2日
@author: 瞌睡蟲子
'''
import lxml.html as dom
from cssselect import GenericTranslator
import json, jsonpath, re, js2py
def initJson(data):
    """Set the module-wide default JSON document used by later queries.

    Args:
        data: One of
            * an already-parsed ``dict`` or ``list`` (stored as-is),
            * a file-like object exposing ``read()`` (parsed with
              ``json.load``),
            * a JSON ``str``/``bytes`` payload (parsed with ``json.loads``).

    Raises:
        json.JSONDecodeError: if the text is not valid JSON.
    """
    global jsn
    if isinstance(data, (dict, list)):
        jsn = data
    elif hasattr(data, "read"):
        # File-like input: the only non-dict form the original supported.
        jsn = json.load(data)
    else:
        # Plain JSON text; json.load() would fail here because it needs .read().
        jsn = json.loads(data)
def jpath(exp, data=None):
    """Evaluate the JSONPath expression *exp* against a JSON document.

    Args:
        exp: JSONPath expression, passed unchanged to ``jsonpath.jsonpath``.
        data: Optional document. A ``dict`` or ``list`` is used directly;
            anything else is parsed with ``json.loads``. When omitted, the
            module-level document ``jsn`` is queried (it is set by
            ``initJson``; a ``NameError`` results if it was never set).

    Returns:
        Whatever ``jsonpath.jsonpath(document, exp)`` returns.
    """
    if data is None:
        document = jsn
    elif isinstance(data, (dict, list)):
        # Already-parsed input; lists are valid JSON roots as well as dicts.
        document = data
    else:
        document = json.loads(data)
    return jsonpath.jsonpath(document, exp)
def initDom(html):
    """Parse *html* and keep the tree as the module-wide default document.

    The parsed tree is stored in the global ``docm``. If parsing the value
    as given raises, the error is printed and parsing is retried once with
    ``html.encode()``; a failure of that retry propagates and leaves
    ``docm`` untouched.
    """
    global docm
    try:
        tree = dom.document_fromstring(html)
    except Exception as err:
        # Best-effort fallback: report the problem, then retry with bytes.
        print("Error:" + str(err))
        tree = dom.document_fromstring(html.encode())
    docm = tree
def xpath(exp, html=None):
    """Run the XPath expression *exp* and return the results as strings.

    Args:
        exp: XPath expression.
        html: Optional markup to parse for this call only. When omitted, the
            module-level document ``docm`` (set by ``initDom``) is queried.

    Returns:
        A list of strings. Element results are serialised back to markup;
        every other result (text nodes, attribute values, ...) is converted
        with ``str``. A scalar XPath result (number, boolean, string) is
        returned as a single-item list.
    """
    if html is not None:
        try:
            dm = dom.document_fromstring(html)
        except Exception as e:
            # Best-effort fallback: report the problem, then retry with bytes.
            print("Error:" + str(e))
            dm = dom.document_fromstring(html.encode())
    else:
        dm = docm
    result = dm.xpath(exp)
    if not isinstance(result, list):
        # Expressions such as count(...) or string(...) yield a scalar, not a
        # node list; wrap it instead of iterating over it.
        return [str(result)]
    # Decide per item rather than from the first item only, and use
    # isinstance so HtmlElement subclasses (lxml.html has specialised classes
    # for some tags, e.g. forms) are serialised as markup too.
    return [
        dom.tostring(item, encoding="utf-8").decode()
        if isinstance(item, dom.HtmlElement)
        else str(item)
        for item in result
    ]
def css(exp, html=None):
    """Run the CSS selector *exp* by translating it to XPath first.

    The selector is converted with ``GenericTranslator().css_to_xpath`` and
    the resulting expression is handed to ``xpath`` together with *html*.
    """
    translator = GenericTranslator()
    xpath_expression = translator.css_to_xpath(exp)
    return xpath(xpath_expression, html)
def replace(reg, exp, come):
    """Regex-substitute inside *come* with MULTILINE and DOTALL enabled.

    Args:
        reg: Regular-expression pattern source.
        exp: Replacement. If the text contains the word ``lambda`` it is
            evaluated as a Python expression and the result (normally a
            callable taking the match object) is used as the replacement;
            otherwise it is used as a literal ``re.sub`` template.
        come: The input string to process.

    Returns:
        The string after substitution.
    """
    pattern = re.compile(reg, re.MULTILINE | re.DOTALL)
    if "lambda" in exp:
        # SECURITY: eval() executes arbitrary code; never pass replacement
        # text that comes from an untrusted source.
        replacement = eval(exp, globals(), locals())
    else:
        replacement = exp
    return re.sub(pattern, replacement, come)
def pyeval(exp, args=None):
    """Evaluate the Python expression *exp* and return its value.

    The expression can refer to module globals and to the names ``exp`` and
    ``args``.

    Args:
        exp: Source text of a Python expression.
        args: Arbitrary value exposed to the expression under the name
            ``args``.

    Returns:
        The value of the evaluated expression.
    """
    # SECURITY: eval() executes arbitrary code; never pass expressions that
    # come from an untrusted source.
    # A single merged namespace is used instead of separate globals/locals:
    # nested scopes (lambdas, generator expressions) resolve free names via
    # globals only, so with separate mappings ``args`` was not visible there.
    namespace = dict(globals())
    namespace["exp"] = exp
    namespace["args"] = args
    return eval(exp, namespace)
def createJsEval(param=None):
    """Create a new ``js2py.EvalJs`` context.

    Args:
        param: Optional mapping passed to ``js2py.EvalJs``. Defaults to a
            fresh empty dict for every call (a ``{}`` default in the
            signature would be one object shared by all calls).

    Returns:
        The newly created ``js2py.EvalJs`` instance.
    """
    if param is None:
        param = {}
    return js2py.EvalJs(param)
def jsExecute(evjs, js=None, use=False):
    """Forward *js* to ``evjs.execute`` and return its result.

    Args:
        evjs: Object exposing ``execute`` (presumably a context made by
            ``createJsEval`` - confirm against callers).
        js: First positional argument for ``execute``.
        use: Second positional argument for ``execute``.
    """
    execute = evjs.execute
    outcome = execute(js, use)
    return outcome
def jsEval(evjs, expression, use=False):
    """Forward *expression* to ``evjs.eval`` and return its result.

    Args:
        evjs: Object exposing ``eval`` (presumably a context made by
            ``createJsEval`` - confirm against callers).
        expression: First positional argument for ``eval``.
        use: Second positional argument for ``eval``.
    """
    evaluate = evjs.eval
    value = evaluate(expression, use)
    return value
def callJsFun(evjs, fun=None, args=None):
    """Call *evjs* itself, or the attribute of *evjs* named *fun*.

    Args:
        evjs: The object to call, or the owner of the function to call.
        fun: Name of the attribute to call. When falsy, *evjs* itself is
            called.
        args: Single argument passed to the call. When ``None`` the target
            is called without arguments.

    Returns:
        The result of the call.

    Raises:
        Exception: if *fun* is given but *evjs* has no such attribute.
    """
    if fun:
        if not hasattr(evjs, fun):
            raise Exception("调用方法不存在")
        target = getattr(evjs, fun)
    else:
        target = evjs
    if args is not None:
        return target(args)
    # Previously the named function was returned uncalled here, unlike the
    # no-name branch which calls evjs(); both branches now invoke the target.
    return target()
def callJsProperty(evjs, attr=None):
    """Return *evjs*, or the result of evaluating ``evjs.<attr>``.

    Args:
        evjs: The object to read from.
        attr: Text appended after ``evjs.`` and evaluated, so dotted paths
            such as ``"a.b"`` work. When ``None``, *evjs* itself is returned.

    Returns:
        *evjs* when *attr* is ``None``, otherwise the evaluated value.
    """
    # The original tested the builtin ``property`` (never None) instead of
    # ``attr``, so this branch was unreachable and attr=None raised TypeError.
    if attr is None:
        return evjs
    # SECURITY: the text is spliced into an expression and eval()'d; never
    # pass attribute paths that come from an untrusted source.
    return eval("evjs." + attr, globals(), locals())
if __name__ == "__main__":
    # Manual demo: parse a local XML file and print a few query results.
    # NOTE(review): the source lost its indentation; the loop is assumed to
    # end before the final document-wide query - confirm against the original.
    # Read the whole file; the context manager closes the handle afterwards.
    with open("C:\\Users\\Administrator\\Desktop\\text.xml", encoding='UTF-8') as f:
        fr = f.read()
    initDom(fr)
    a = xpath("//poi", fr)
    print(a)
    for i in a:
        print(i)
        # Each result is a markup string, so it is re-parsed for the
        # nested queries below.
        j = xpath("//name", i)
        print(j)
        print(xpath("//name//text()", j[0]))
    # No markup argument: this query runs against the document set by initDom.
    print(xpath("//pname/text()"))
# html = '''<ul>
#
# <li class="menuItemList">
# <a id="CatList_LinkList_0_Link_0" href="https://www.cnblogs.com/hiwuchong/category/1132709.html">Python编程(43)</a>
# <span style="width:10px">
# 啊啊啊啊
# </span>
# </li>
#
# <li class="menuItemList">
# <a id="CatList_LinkList_0_Link_1" href="https://www.cnblogs.com/hiwuchong/category/1003142.html">R语言实战(3)</a>
# <span style="width:10px">
# 大幅度发
# </span>
# </li>
#
# <li class="menuItemList">
# <a id="CatList_LinkList_0_Link_2" href="https://www.cnblogs.com/hiwuchong/category/1003145.html">SQL SERVER(4)</a>
# <img src="aaaaaaaaaa"></img>
# <span style="width:10px">
# 大多数
# </span>
# </li>
#
# </ul>
# '''
# # initDom(html)
# # print(xpath("//li/a/text()",html))
# # initDom(html)
# # print(css("li a"))
# # initJson({"a":"123","b":123,"c":"啊哈","d":{"e":"aa"},"f":[{"a":"ad"},{"g":"df","m":"dfd"}]})
# # print(jpath("$..f","{\"a\":1,\"f\":2}"))
# # print(jpath("$..f[*]"))
# # print(pyeval("[item+'aa' for item in args]", ["a","b","c"]))
#
# # url = 'https://113.215.20.136:9011/113.215.6.77/c3pr90ntcya0/youku/6981496DC9913B8321BFE4A4E73/0300010E0C51F10D86F80703BAF2B1ADC67C80-E0F6-4FF8-B570-7DC5603F9F40.flv'
# print(replace(r'<(?!img).*?>', "", html))
# # print(replace(r'<img\s+src()>', "", html))