Python爬虫(5)-存储数据

2018-11-11  本文已影响9人  初灬终

环境:Python3.6
工具:PyCharm
目录:Menu
目标:将采集的数据存储在本地

1.普通文件的存储(.txt, .jpg等)

例如:下载某图片网页上的所有图片

from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
import os

# Pattern for the image URLs to download: three path segments under
# /uploads/tu/ followed by ".jpg". Written as a raw string so the regex
# escapes (\. and \w) are not treated as (invalid) string escapes.
IMAGE_SRC_PATTERN = re.compile(r"http://t1\.27270\.com/uploads/tu(/\w+){3}\.jpg")

# Fetch and parse the page; the response is closed as soon as it is parsed.
with urlopen("http://www.27270.com/tag/1756.html") as html:
    bsObj = BeautifulSoup(html, "lxml")
images = bsObj.find_all("img", {"src": IMAGE_SRC_PATTERN})

# Destination directory for the downloaded files. exist_ok=True replaces the
# isdir()/mkdir() pair and avoids failing if the directory already exists.
des = "/tmp/27270image/"
os.makedirs(des, exist_ok=True)

for index, image in enumerate(images):
    print(image["src"])
    # Files are named after their position in the result list: 0.jpg, 1.jpg, ...
    urlretrieve(image["src"], os.path.join(des, "%d.jpg" % index))
print("total download %d images" % len(images))

查看存储目录,图片已下载。

2.数据库存储

第一步 创建数据库,创建表(这里database是python,table是beauty)

-- Create the database and select it so the table below is created inside it.
CREATE DATABASE python;
USE python;

-- One row per scraped image: its title, source URL and declared dimensions.
-- `id` is the auto-incrementing primary key; `created` defaults to the
-- insertion time.
CREATE TABLE `beauty` (
  `id` int unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(200),
  `image_url` varchar(2000),
  `created` timestamp DEFAULT CURRENT_TIMESTAMP,
  `height` int NOT NULL,
  `width` int NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

第二步 抓取数据,存入数据库(主要是抓取title,image_url,height,width这4个字段信息)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

# Open the MySQL connection shared by the rest of the script and obtain a
# cursor from it.
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="root",
    db="python",
    charset="utf8",
)
cur = conn.cursor()
# Explicitly select the "python" database for the statements that follow.
cur.execute("USE python")


# Store one record in MySQL.
def store(title, image_url, width, height):
    """Insert one image record into the ``beauty`` table and commit it.

    Uses the module-level cursor ``cur``. The values are passed as query
    parameters, so the driver does the quoting and escaping; the
    placeholders must therefore NOT be wrapped in quotes in the SQL text.

    Args:
        title: Value for the ``title`` column.
        image_url: Value for the ``image_url`` column.
        width: Value for the ``width`` column.
        height: Value for the ``height`` column.
    """
    print("store(%s,%s,%s,%s)" % (title, image_url, width, height))
    cur.execute(
        "INSERT INTO beauty (title, image_url, width, height) VALUES (%s, %s, %s, %s)",
        (title, image_url, width, height),
    )
    # Commit per row so each stored record is persisted immediately.
    cur.connection.commit()

# Fetch and parse the page; the response is closed as soon as it is parsed.
with urlopen("http://www.27270.com/tag/1756.html") as html:
    bsObj = BeautifulSoup(html, "lxml")

# find() returns None when the <ul id="Tag_list"> element is absent; in that
# case there is nothing to iterate. The name "items" avoids shadowing the
# builtin "list".
tag_list = bsObj.find("ul", {"id": "Tag_list"})
items = tag_list.children if tag_list is not None else []

try:
    for li in items:
        # .children yields Tag objects as well as NavigableString nodes;
        # the string nodes are filtered out here and only tags are processed.
        if isinstance(li, str):
            continue

        # Skip entries that have no <a> or no <img> inside it (find() -> None).
        a = li.find("a")
        if a is None:
            continue
        img = a.find("img")
        if img is None:
            continue

        # Read the four fields fresh for every item so a missing attribute
        # falls back to a default instead of reusing the previous item's value.
        title = a.attrs.get("title", "")
        image_url = img.attrs.get("src", "")
        width = int(img.attrs.get("width", 0))
        height = int(img.attrs.get("height", 0))

        store(title, image_url, width, height)
finally:
    # Release the database resources even if scraping or storing failed.
    cur.close()
    conn.close()

查看结果


image.png
上一篇下一篇

猜你喜欢

热点阅读