爬虫第一章

2016-11-18 本文已影响0人王小坤_GO

from bs4 import BeautifulSoup

path = './1_2_homework_required/index.html'

# Class string that marks an unfilled star <span> in the ratings block.
EMPTY_STAR_CLASS = 'glyphicon glyphicon-star-empty'


def group_star_ratings(stars, group_size=5):
    """Convert a flat sequence of star elements into per-item symbol lists.

    Each element is rendered with ``str()``; if the text contains
    ``EMPTY_STAR_CLASS`` it becomes '☆', otherwise '★'.  Symbols keep the
    order of the input and are split into consecutive groups of
    ``group_size``.

    Args:
        stars: Sequence of star elements (bs4 tags or plain strings).
            The sequence is not modified.
        group_size: Number of stars that belong to one rating.

    Returns:
        A list of lists of '★' / '☆'.  If ``len(stars)`` is not a multiple
        of ``group_size`` the final group is shorter instead of raising
        ``IndexError``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError('group_size must be at least 1')
    # Append-order (not insert(1, ...)) so the symbols match the page order.
    symbols = [
        '☆' if EMPTY_STAR_CLASS in str(star) else '★'
        for star in stars
    ]
    return [
        symbols[start:start + group_size]
        for start in range(0, len(symbols), group_size)
    ]


def main():
    """Parse the HTML file at ``path`` and print one info dict per entry.

    Prints the grouped star ratings first, then a dict with the keys
    'image', 'title', 'price', 'word' and 'star' for every entry.
    """
    with open(path, 'r') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    images = soup.select('div > div > div > div > div > div > img')
    titles = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    prices = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
    words = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > p')
    stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p > span')

    starslast = group_star_ratings(stars)
    print(starslast)

    # zip() stops at the shortest list, so an entry missing any field
    # truncates the output rather than raising.
    for image, title, price, word, star in zip(images, titles, prices, words, starslast):
        info = {
            'image': image.get('src'),
            'title': title.get_text(),
            'price': price.get_text(),
            'word': word.get_text(),
            'star': star,
        }
        print(info)


if __name__ == '__main__':
    main()

总结：

（1）旧版 BeautifulSoup（4.7 之前）的 select() 不支持 nth-child（只支持 nth-of-type），因此需要去掉 CSS 选择器中的 nth-child 语法

（2）图片链接放在了 img 标签的 src 属性里面，我们可以利用 get() 函数得到该属性的内容：image_content=image.get("src")