Python爬虫(5)-存储数据
2018-11-11 本文已影响9人
初灬终
环境:Python3.6
工具:PyCharm
目录:Menu
目标:将采集的数据存储在本地
1.普通文件的存储(.txt, .jpg等)
例如:下载某图片网页上的所有图片
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
import os
# Download every thumbnail on the tag page into a local directory.

# Close the HTTP response as soon as the document has been parsed.
with urlopen("http://www.27270.com/tag/1756.html") as html:
    bsObj = BeautifulSoup(html, "lxml")

# Raw string: the backslashes reach the regex engine untouched instead of
# being read as (invalid) Python string escapes. ":" and "/" need no escaping.
images = bsObj.find_all(
    "img",
    {"src": re.compile(r"http://t1\.27270\.com/uploads/tu(/\w+){3}\.jpg")},
)

des = "/tmp/27270image/"
# Creates missing parent directories and is a no-op when the directory
# already exists, so no separate isdir() check is needed.
os.makedirs(des, exist_ok=True)

for index, image in enumerate(images):
    print(image["src"])
    # Files are named by their position on the page: 0.jpg, 1.jpg, ...
    urlretrieve(image["src"], des + "%d.jpg" % index)

print("total download %d images" % len(images))
查看存储目录,图片已下载。
2.数据库存储
第一步 创建数据库,创建表(这里database是python,table是beauty)
-- Create the database and select it, so the table below is created inside it.
CREATE DATABASE python;
USE python;
-- One row per scraped image: title, image URL and the declared dimensions.
CREATE TABLE `beauty` (
`id` int unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(200),
`image_url` varchar(2000),
`created` timestamp DEFAULT CURRENT_TIMESTAMP,
`height` int NOT NULL,
`width` int NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
第二步 抓取数据,存入数据库(主要是抓取title,image_url,height,width这4个字段信息)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
# Open the MySQL connection shared by the rest of the script.
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="root",
    db="python",
    charset="utf8",
)
# A single cursor is reused for every statement below.
cur = conn.cursor()
# Explicitly select the working database.
cur.execute("USE python")
# Store one record in MySQL.
def store(title, image_url, width, height):
    """Insert one image record into the ``beauty`` table and commit it.

    Uses the module-level cursor ``cur``.

    Args:
        title: Image title text.
        image_url: URL of the image.
        width: Image width.
        height: Image height.
    """
    print("store(%s,%s,%s,%s)" % (title, image_url, width, height))
    # The placeholders must be bare %s: the driver quotes and escapes each
    # parameter itself. Wrapping them in extra double quotes would store the
    # driver's own single quotes as part of the value.
    cur.execute(
        "INSERT INTO beauty (title, image_url, width, height) VALUES (%s, %s, %s, %s)",
        (title, image_url, width, height),
    )
    cur.connection.commit()
# Scrape title / image URL / width / height for every entry in the tag list
# and store each one through store().
try:
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen("http://www.27270.com/tag/1756.html") as html:
        bsObj = BeautifulSoup(html, "lxml")

    # find() returns None when nothing matches, so guard before using it.
    tag_list = bsObj.find("ul", {"id": "Tag_list"})
    if tag_list is None:
        print("ul#Tag_list not found; nothing to store")
        entries = []
    else:
        entries = tag_list.children

    for li in entries:
        # .children yields Tag objects as well as NavigableString nodes
        # (a str subclass, e.g. whitespace between tags); skip the strings.
        if isinstance(li, str):
            continue

        a = li.find("a")
        if a is None:
            continue
        img = a.find("img")
        if img is None:
            continue

        # Read the fields fresh for every entry so that a missing attribute
        # falls back to a default instead of reusing the previous entry's value.
        title = a.attrs.get("title", "")
        image_url = img.attrs.get("src", "")
        width = int(img.attrs.get("width", 0))
        height = int(img.attrs.get("height", 0))

        store(title, image_url, width, height)
finally:
    # Release the cursor and connection even if fetching or parsing failed.
    cur.close()
    conn.close()
查看结果
image.png