5、Python BeautifulSoup 库,简单调用
2019-03-08 本文已影响0人
波罗的海de夏天
from bs4 import BeautifulSoup
# Sample HTML document that is handed to BeautifulSoup below.
# It holds a <head> with a <title id='title'> plus several <meta> tags, and a
# <body> whose <nav> contains one <h3 class="wine"> and two <a> links.
html_string = """
<!DOCTYPE html>
<head>
<title id='title'>天下文章</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0,user-scalable=no">
<meta name="mobile-agent" content="format=html5;url=https://www.jianshu.com/">
</head>
<body lang="zh-CN" class="reader-black-font">
<nav class="navbar navbar-default navbar-fixed-top" role="navigation">
<h3 class="wine" href="https://www.baidu.com/" style='font-size:12px;line-height:28px;margin: 8px 0;'>喝孔府宴酒 做天下文章</h3>
<a class="logo" href="/"><img src="//cdn2.jianshu.io/assets/web/nav-logo-4c7bbafe27adc892f3046e6978459bac.png" alt="Nav logo" />写文章</a>
<a class="btn write-btn" target="_blank" href="/writer#/" style='font-size:10px;line-height:48px;margin: 18px 0;'><i class="iconfont ic-write"></i>写文章</a>
</nav>
</body>
</html>
"""
# Create the BeautifulSoup object: the HTML text is passed in together with
# the name of the parser to use (lxml).
parser_name = 'lxml'
soup = BeautifulSoup(html_string, parser_name)
# Uncomment to dump the parsed document:
# print(soup)
# print('---'*20)
# Uncomment to dump the document pretty-printed:
# print(soup.prettify())
# print('---'*20)
# Tag selectors
# Select elements
for tag in (soup.title, soup.h3):
    print(tag)
print('---' * 20)
# Get the tag names
for tag in (soup.title, soup.h3):
    print(tag.name)
print('---' * 20)
# Get the attributes
for tag in (soup.title, soup.h3):
    print(tag.attrs)
h3_attrs = soup.h3.attrs
print(h3_attrs['class'])
print('---' * 20)
# Get the content (.string and .text are printed with their types so the
# difference between the two can be seen)
title_string = soup.title.string
print('<title> string:', title_string, 'type:', type(title_string))
title_text = soup.title.text
print('<title> text:', title_text, 'type:', type(title_text))
h3_string = soup.h3.string
print('<h3> string:', h3_string, 'type:', type(h3_string))
h3_text = soup.h3.text
print('<h3> text:', h3_text, 'type:', type(h3_text))
print('---' * 20)
# Nested selection
head_tag = soup.head
print(head_tag.title)
nav_tag = soup.body.nav
print(nav_tag.h3)
print('---' * 20)
# Child nodes, approach 1: .contents
print(soup.head.contents)
print(soup.nav.contents)
print('---' * 20)
# Child nodes, approach 2: .children (an iterator, so it is printed once as an
# object and then looped over to show the actual children)
print(soup.head.children)
for child in soup.head.children:
    print(child)
print('---' * 20)
print(soup.nav.children)
for i, child in enumerate(soup.nav.children):
    print(i, child)
print('---' * 20)
# Check whether the first child of <nav> is a newline string
item0 = list(soup.nav.children)[0]
print('bb:', item0)
# The comparison already yields True/False, so print it directly.
print(item0 == '\n')
print('---' * 20)
# Child nodes, approach 3: .descendants
print(soup.head.descendants)
for i, child in enumerate(soup.head.descendants):
    print(i, child)
print('---' * 20)
# Parent nodes
print(soup.head.parents)
for i, parent in enumerate(soup.head.parents):
    print(i, parent)
print('---' * 20)
# Sibling nodes
print(soup.head.next_siblings)
# The loop variable is deliberately not called `next`, which would shadow the
# builtin next() for the rest of the script.
for i, next_sibling in enumerate(soup.head.next_siblings):
    print(i, next_sibling)
print('---' * 20)
print(soup.body.previous_siblings)
for i, previous_sibling in enumerate(soup.body.previous_siblings):
    print(i, previous_sibling)
print('---' * 20)
# Standard selectors
# find_all(name, attrs, recursive, string, limit, **kwargs) searches by tag
# name, attributes, or text content.
# By tag name
print(soup.find_all('a'))  # every <a>
print('---' * 20)
print(soup.find_all('a')[0])
print('---' * 20)
for item in soup.find_all('a'):
    print(item)
print('---' * 20)
# By attrs dictionary
print(soup.find_all(attrs={'id': 'title'}))
print(soup.find_all(attrs={'class': 'wine'}))
print('---' * 20)
# By the id and class_ keyword arguments
print(soup.find_all(id='title'))
print(soup.find_all(class_='wine'))
print('---' * 20)
# By text content. `string=` is the current name of the argument that used to
# be spelled `text=`; the old spelling is kept only for backward compatibility.
print(soup.find_all(string='天下文章'))
print(soup.find_all(string='喝孔府宴酒 做天下文章'))
print('---' * 20)
# find(): returns a single element, whereas find_all() returns all of them
print(soup.find('a'))
print(soup.find(attrs={'id': 'title'}))
print(soup.find(class_='wine'))
print(soup.find(string='写文章'))
print('---' * 20)
# CSS selectors -- pass a CSS selector string straight to select()
print(soup.select('.wine'))  # by class
print(soup.select('#title'))  # by id
print(soup.select('nav a'))
# Look the first <a> up once and reuse it for both the type and the value.
first_link = soup.select('a')[0]
print(type(first_link), first_link)
print('---' * 20)
# CSS selectors: run a further select() on each result
print(soup.select('nav'))
for item in soup.select('nav'):
    print(item.select('.wine'))
print('---' * 20)
# CSS selectors: read an attribute of each result
# print(soup.select('nav'))
for item in soup.select('nav'):
    print(item['class'])
print('---' * 20)
# CSS selectors: read the text content of each result
for item in soup.select('a'):
    print(item.get_text())