只用re正则解析html

2021-10-08  本文已影响0人  是东东

目前支持解析 div 标签,以及 div 内的 a 标签(持续更新中……)

# -*- coding:utf-8 -*-
import re


def _strip_layout_whitespace(html):
    """Remove line breaks, tabs and doubled spaces from *html*."""
    # Backslash-prefixed breaks go first so the backslash is removed together
    # with the control character. One pass over '  ' is enough: a run of k
    # spaces becomes k % 2 spaces, so no doubled space can remain afterwards.
    # NOTE: doubled spaces are deleted (not collapsed to one) to keep the
    # output format this function has always produced.
    for token in ('\\\n', '\\\t', '\\\r', '\n', '\t', '\r', '  '):
        html = html.replace(token, '')
    return html


def _balanced_elements(html, opener, tag):
    """Return each element in *html* starting with *opener*, cut at its own end tag."""
    tag_token = re.compile(rf'<{re.escape(tag)}\b|</{re.escape(tag)}>')
    found = []
    start = html.find(opener)
    while start != -1:
        # Track nesting depth so the fragment ends at the matching close tag,
        # not at the first or last one that happens to follow.
        depth = 0
        for token in tag_token.finditer(html, start):
            if token.group().startswith('</'):
                depth -= 1
                if depth == 0:
                    found.append(html[start:token.end()])
                    break
            else:
                depth += 1
        # An opener with no matching close tag is skipped silently.
        start = html.find(opener, start + len(opener))
    return found


def _anchor_records(html, attr, value):
    """Return href/text records for every <a> whose *attr* equals *value*."""
    # Match the whole opening tag so attributes written before *attr*
    # (typically href) are captured too, and only real <a> tags qualify.
    anchor = re.compile(
        rf'<a\b[^>]*?\s{re.escape(attr)}="{re.escape(value)}"[^>]*>.*?</a>',
        re.S,
    )
    records = []
    for element in anchor.findall(html):
        records.append({
            'href': re.findall(r'href="(.*?)"', element),
            'text': {
                # Text runs between consecutive tags.
                'texts': re.findall(r'>(.*?)<', element, re.S),
                # Full inner markup of the anchor.
                'ele': re.findall(r'>(.*?)</a>', element, re.S),
            },
        })
    return records


def get_div_ele(pattern, any_str):
    """Extract elements from HTML text using a tiny XPath-like selector.

    Only selectors of the form ``//div[@attr="value"]`` and
    ``//a[@attr="value"]`` are handled; any other tag yields an empty list.

    Args:
        pattern: Selector string, e.g. ``//div[@class="box"]``.
        any_str: HTML text to search.

    Returns:
        For ``div``: a list of HTML fragments (str), one per matching element,
        with line breaks, tabs and doubled spaces removed. Nested matching
        divs are returned as separate entries in document order.
        For ``a``: a list of dicts shaped like
        ``{'href': [...], 'text': {'texts': [...], 'ele': [...]}}``.

    Raises:
        IndexError: if *pattern* does not contain a tag, an attribute name
            and a quoted value in the expected positions.
    """
    tag = re.findall(r'//([a-z]+)\[', pattern)[0]
    ele = re.findall(r'@([\w-]+)?=', pattern)[0]
    va = re.findall(r'[",\'](.*)?[",\']', pattern)[0]

    if tag == 'a':
        ll = _anchor_records(any_str, ele, va)
        if not ll:
            print(f'{tag} pattern没有匹配到')
        return ll

    if tag == 'div':
        # Greedy on purpose: span from the first matching opener to the last
        # closing tag, then carve individual elements out of that region.
        region = re.search(
            rf'<{tag} {re.escape(ele)}="{re.escape(va)}">.*</{tag}>',
            any_str,
            re.S,
        )
        if region is None:
            print(f'{tag} pattern没有匹配到')
            return []
        cleaned = _strip_layout_whitespace(region.group(0))
        return _balanced_elements(cleaned, f'<{tag} {ele}="{va}"', tag)

    return []


# Demo driver: read a saved page, pull out every div matching the selector,
# then collect the matching anchors found inside each of those divs.
result = {'data': []}
with open('baidu_ad.html', 'r', encoding='utf-8') as rr:
    text = rr.read()
ad_blocks = get_div_ele('//div[@class="_3te7bpt f13 c-gap-top-xsmall"]', text)
for block in ad_blocks:
    # Append one entry per div. Assigning to result['data'] directly would
    # overwrite it on every pass and keep only the last div.
    result['data'].append({
        'result': block,
        'urls': get_div_ele('//a[@class="c-showurl c-color-gray"]', block),
    })
print(result)
上一篇 下一篇

猜你喜欢

热点阅读