0008-HTML提取出正文

2017-08-17  本文已影响0人  gogoforit

代码

import requests
import re


def request_get(url, timeout=10):
    """Download ``url``, decode it as GB2312 and hand the HTML to ``re_handle``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout, or when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of running the regex over it.
    response.raise_for_status()
    # Set the charset before reading .text so the Chinese text is not garbled.
    response.encoding = 'gb2312'
    re_handle(response.text)


def re_handle(text):
    """Extract the article paragraph from the page HTML and print it.

    The paragraph is located inside ``<div id="twgg" class="gg">``; any tags
    inside it and all whitespace are stripped before the result is passed to
    ``print_text``.

    Args:
        text: Full HTML source of the page.

    Raises:
        IndexError: If the page contains no matching div/paragraph.
    """
    # re.S lets '.' cross newlines. The greedy '.*' after the div skips ahead
    # to the last '<p>' + whitespace that still has a closing '</p>' after it.
    match = re.search(r'<div id="twgg" class="gg">.*<p>\s+(.*?)</p>', text, re.S)
    if match is None:
        # Same exception type the bare `findall(...)[0]` raised, but with a
        # message that says what was actually missing.
        raise IndexError('no <p> inside <div id="twgg" class="gg"> found in page')
    body = re.sub(r'<.*?>', '', match.group(1))  # drop inline tags
    body = re.sub(r'\s+', '', body)  # raw string: '\s' is not a valid str escape
    print_text(body)


def print_text(text):
    """Write ``text`` to standard output, followed by a newline."""
    print(text, end="\n")

if __name__ == '__main__':
    # Script entry point: fetch one fixed article page and print its body text.
    # NOTE(review): the host "www.jjcom" has no dot before "com" - it looks
    # truncated; confirm the real domain before running.
    request_get('http://www.jjcom/jjart/412827.html')

新知识

requests.get 获取的中文内容出现乱码时,需要指定编码格式

response = requests.get(url)
response.encoding = 'gb2312'  # set the charset explicitly before reading .text to avoid garbled Chinese
text = response.text

正则跨行匹配,要加re.S

text = re.findall(r'<div id="twgg" class="gg">.*<p>\s+(.*?)</p>', text, re.S)[0]  # multi-line match: re.S lets '.' match newlines
上一篇 下一篇

猜你喜欢

热点阅读