Scraping web pages, saving them locally, and parsing them

2018-11-21  AlastairYuan

Saving the scraped pages locally

savedata_Chrome_byurl.py
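
This first script drives Chrome through Selenium: it reads (url, pagecode) pairs from MySQL, opens each URL, saves the page's full HTML under a local 10004/ folder, then clicks the pagehero__button element and saves the page it leads to as <pagecode>_nextpage.html.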


from selenium import webdriver

import time

import io

import csv

import pymysql

import os

import re

from lxml import etree

import codecs

def savepage(browser, filepath, pagename):

    try:

        if not os.path.exists(filepath):

            os.mkdir(filepath)

        textContent = browser.find_element_by_xpath('//html').get_attribute('outerHTML')

        str_utf8 = textContent.encode("UTF-8")

        textContent = str_utf8.decode('UTF-8', 'strict')

        pagepath = filepath + '//' + pagename + '.html'

        fp = open(pagepath, "w", encoding='UTF-8')

        fp.write(textContent)

        fp.close()

    except Exception as excpt:

        print(excpt)

def getDbConn(db):

    isonserver = True

    osname = os.name

    if osname == 'nt':

        isonserver = False

        print('windows')

    else:

        isonserver = True

        print(os.name)

    isonserver = False  # hard-coded override: always fall through to the credentials below

    if isonserver:

        host = 'localhost'

        user = 'root'

        passwd = '123456'

    else:

        host = ''  # fill in the MySQL host

        user = ''  # fill in the MySQL user

        passwd = ''  # fill in the MySQL password

    # db = 'couponcategory'

    port = 3306

    conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

    return conn

def parse_data_page_step1(browser, url, urlname):

    print('doing.......')

    creditcard__items = browser.find_elements_by_xpath('//div[@class="creditcard__item"]')

    for creditcard__item in creditcard__items:

        try:

            # title = creditcard__item.find_element_by_xpath('.//h2[@class="creditcard__heading"]').get_attribute('textContent')

            article = creditcard__item.find_element_by_xpath('./article')

            href = article.find_element_by_xpath('./div[@class="compare"]').find_element_by_xpath('./div[last()]/a').get_attribute('href')

            # .get_attribute('href')

            item = {}

            item['url'] = url

            item['url2'] = href

            item['info0'] = urlname

            # item['info1'] = title

            print(urlname)

            print(url)

            print(href)

            stu1 = [url, href, urlname, '']

            out = open('fix10004.csv', 'a', newline='')

            # out = open('d:/data_source10004_v1.csv', 'a', newline='')

            # create a CSV writer using the Excel dialect

            csv_write = csv.writer(out, dialect='excel')

            # write the row

            csv_write.writerow(stu1)

            out.close()

        except Exception as aas:

            print(aas)

        # print('write item.............................................')

        # print(item)

        # dbname = 'brcardsdata'

        # dbtablename = 'data_source10004_url_v2'

        # updateToDatabase(dbname, dbtablename, item)

        # print('write item..............................................')

def get_key_url_map(dbname, tablename):

    conn = getDbConn(dbname)

    cursor = conn.cursor()

    print("mysql connect success")

    sql = "select url,pagecode from " + tablename

    cursor.execute(sql)

    dataresult = cursor.fetchall()

    conn.close()

    return dataresult
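
The query only assumes that the source table exposes url and pagecode columns. A minimal sketch of such a table, using the pagedata database and data_source10004_url table named in the main loop below; the column types are an assumption for illustration, not taken from the article:

# Hypothetical schema: only the url and pagecode columns are implied by the SELECT above.
conn = getDbConn('pagedata')
cursor = conn.cursor()
cursor.execute("create table if not exists data_source10004_url ("
               "url varchar(512), pagecode varchar(64))")
conn.commit()
conn.close()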

def scrapyStart1(browser, url, pagecode):

    # an element's content can be read with one of:

    # get_attribute('textContent')

    # get_attribute('innerHTML')

    # get_attribute('outerHTML')

    print('4')

    time.sleep(1)

    print('6')

    browser.get(url)

    print('7')

    time.sleep(5)

    print('8')

    try:

        savepage(browser, '10004', pagecode)

    except Exception as errr:

        print('........currpage....error......................')

        print(errr)

    try:

        targetElem = browser.find_element_by_xpath('//div[@class="pagehero__button"]')

        browser.execute_script("arguments[0].focus();", targetElem)

        time.sleep(0.5)

        targetElem.click()

        time.sleep(1.8)

        print('8')

        pagecode2 = pagecode + '_nextpage'

        savepage(browser, '10004', pagecode2)

    except Exception as eerr:

        print('........nextpage....error......................')

        print(eerr)

# re.sub(r'\?.*','',url)

browser = webdriver.Chrome()

time.sleep(0.5)

browser.maximize_window()

time.sleep(1)

key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

# key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

for key_url in key_url_map:

    url = key_url[0]

    pagecode = key_url[1]

    pagecode = str(pagecode)

    print(url)

    scrapyStart1(browser, url, pagecode)

time.sleep(100)

browser.close()
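
If the crawl has to run on a machine without a display, the same flow works with headless Chrome instead of a maximized visible window. A minimal sketch; the flags below are standard Chrome/Selenium options, not something the original script uses:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--window-size=1920,1080')  # stands in for maximize_window()
browser = webdriver.Chrome(options=options)      # very old Selenium releases use chrome_options= instead
browser.get('https://www.foregon.com/')
print(browser.title)
browser.quit()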

Parsing the saved pages

parsepagedata.py
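
This second script works offline: it reloads the HTML files saved above with loadpage, parses them with lxml's etree.HTML, strips attributes, images, links and buttons from the extracted fragments via trimDataHtmlProAndImg, and collects the results into an item dict that can optionally be written back to MySQL with updateToDatabase.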


from selenium import webdriver

import time

import io

import csv

import pymysql

import os

import re

from lxml import etree

from bs4 import BeautifulSoup

import numpy as np

import codecs

def etreeWebElemToOuterHtml(webitem):

    outerHtml = etree.tostring(webitem)

    outerHtml = outerHtml.decode('utf-8')

    return outerHtml

def trimDataHtmlProAndImg(divstr):

    divstr = re.sub(r' href=".*?"', "", divstr)

    divstr = re.sub(r' class=".*?"', "", divstr)

    divstr = re.sub(r' target=".*?"', "", divstr)

    divstr = re.sub(r' align=".*?"', "", divstr)

    divstr = re.sub(r' rel=".*?"', "", divstr)

    divstr = re.sub(r'<img.*?>', "", divstr)

    divstr = re.sub(r' data-cfemail=".*?"', "", divstr)

    divstr = re.sub(r' id=".*?"', "", divstr)

    divstr = re.sub(r' name=".*?"', "", divstr)

    divstr = re.sub(r' style=".*?"', "", divstr)

    divstr = re.sub(r' src=".*?"', "", divstr)

    divstr = re.sub(r' dir=".*?"', "", divstr)

    divstr = re.sub(r'<div .*?>', "<p>", divstr)

    divstr = re.sub(r'<strong .*?>', "<p>", divstr)

    divstr = re.sub(r'<a .*?</a>', "", divstr)

    divstr = re.sub(r'<p .*?>', "<p>", divstr)

    divstr = re.sub(r'<button .*?</button>', "", divstr)

    divstr = divstr.replace('<div>', '<p>')

    divstr = divstr.replace('<strong>', '<p>')

    divstr = divstr.replace('</div>', '</p>')

    divstr = divstr.replace('</strong>', '</p>')

    return divstr
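
A quick illustration of what this cleanup produces; the fragment below is invented, shaped like the box--body markup parsed further down:

sample = '<div class="box--body"><img src="card.png" align="left"><strong style="color:red">Taxa: R$ 0,00</strong> por mês</div>'
print(trimDataHtmlProAndImg(sample))
# prints: <p><p>Taxa: R$ 0,00</p> por mês</p>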

def loadpage(filepath, pagename):

    try:

        pagepath = filepath + '//' + pagename + '.html'

        htmlf = open(pagepath,'r',encoding="utf-8")

        htmlContent = htmlf.read()

        return htmlContent

    except Exception as excpt:

        print(excpt)

    return ''

def parseWithBeautifulSoup(htmlContent):

    soup = BeautifulSoup(htmlContent, 'lxml')

    mululist = soup.find_all(class_='mulu')

def parseWithXpath(htmlContent):

    html = etree.HTML(htmlContent)

    mululist = html.xpath('.//*[@class="mulu"]')

def getDbConn(db):

    isonserver = True

    osname = os.name

    if osname == 'nt':

        isonserver = False

        print('windows')

    else:

        isonserver = True

        print(os.name)

    isonserver = False  # hard-coded override: always fall through to the credentials below

    if isonserver:

        host = 'localhost'

        user = 'root'

        passwd = '123456'

    else:

        host = ''  # fill in the MySQL host

        user = ''  # fill in the MySQL user

        passwd = ''  # fill in the MySQL password

    port = 3306

    conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)

    return conn

def updateToDatabase(dbname, tablename, item):

    url2 = item['url2']

    updatevalue = {'url2': url2}

    setsqllist = []

    collist = ['info0', 'info1', 'info2', 'info3', 'info4', 'info5', 'info6', 'info7', 'info8', 'info9', 'info10', 'url']

    for idx in range(len(collist)):

        colname = collist[idx]

        if colname in item:

            if item[colname]:

                updatevalue[colname] = item[colname]

                setsqllist.append(colname + '=%(' + colname + ')s')

    setsqllistlen = len(setsqllist)

    if setsqllistlen > 0:

        updatesql = 'update ' + tablename + ' set '

        setsqlliststr = ','.join(setsqllist)

        wherestr = ' where url2=%(url2)s'

        updatesql = updatesql + setsqlliststr + wherestr

        print(updatesql)

        # print(updatevalue)

        conn = getDbConn(dbname)

        cursor = conn.cursor()

        try:

            cursor.execute(updatesql, updatevalue)

        except Exception as e:

            print('Insert Error1', e)

            conn.rollback()

        else:

            conn.commit()

        conn.close()
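
As a quick illustration, with an item like the one below (values invented), the function builds the following UPDATE statement and passes the dict as the parameter map; the commented-out calls in this file point at the brcardsdata database and the data_source10004_url table:

# invented example values
item = {'url2': 'https://example.com/card/1028', 'info1': 'Card name', 'info8': 'Card description'}
# updateToDatabase('brcardsdata', 'data_source10004_url', item) would run:
#   update data_source10004_url set info1=%(info1)s,info8=%(info8)s where url2=%(url2)s
# with updatevalue = {'url2': ..., 'info1': 'Card name', 'info8': 'Card description'}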

def parse_data_page_step1(htmlContent, pageid):

    print('doing.......')

    html = etree.HTML(htmlContent)

    divcon = html.xpath('//div[@class="pagehero__content"]')[0]

    str1 = divcon.xpath('./div[@class="pagehero__wrapper"]/h1[@class="pagehero__heading"]')[0].text

    str2 = divcon.xpath('./div[@class="pagehero__wrapper"]/strong[@class="pagehero__description"]')[0].text

    item = {}

    item['url2'] = url  # relies on the module-level url set in the loop at the bottom of the script

    item['info1'] = str1

    item['info8'] = str2

    print('write item.............................................')

    print(item)

    # dbname = 'brcardsdata'

    # dbtablename = 'data_source10004_url'

    # updateToDatabase(dbname, dbtablename, item)

    print('write item..............................................')

def parse_data_page_step2(htmlContent, pageid):

    print('doing.......')

    html = etree.HTML(htmlContent)

    itemlist = html.xpath('//div[@class="box--list"]/div[@class="box--list-item"]')

    info5 = ''

    info6 = ''

    info7 = ''

    info10 = ''

    for item in itemlist:

        itemcon = item.xpath('./div[@class="box--container"]')[0]

        str1 = itemcon.xpath('./div[@class="box--header"]/h3')[0].text

        print(str1)

        itemconbody = itemcon.xpath('./div[@class="box--body"]')[0]

        str1 = str1.lower()

        str1 = str1.strip()

        # print(str1)

        if str1 == 'online':

            str2item = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0]

            str2 = etreeWebElemToOuterHtml(str2item)

            # print(str2)

            str2 = trimDataHtmlProAndImg(str2)

            str2 = str2.replace('<a></a>', '')

            info5 = '<p>' + str2 + '</p>'

            print('info5')

            print(info5)

        if str1 == 'no local':

            str2item = itemconbody

            str2 = etreeWebElemToOuterHtml(str2item)

            str2 = trimDataHtmlProAndImg(str2)

            info6 = '<p>' + str2 + '</p>'

        if str1 == 'por telefone':

            str2item = itemconbody

            str2 = etreeWebElemToOuterHtml(str2item)

            str2 = trimDataHtmlProAndImg(str2)

            info7 = '<p>' + str2 + '</p>'

        if str1 == 'online':

            try:

                # application link: read the data-redirect attribute with lxml's .get()
                info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0].get('data-redirect')

            except Exception as exx:

                print('....................errr1.......................')

                print(exx)

                try:

                    info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]/button')[0].get('data-redirect')  # application link

                except Exception as exx:

                    print('....................errr2.......................')

                    print(exx)

            if info10:

                info10 = 'https://www.foregon.com' + info10

    item = {}

    item['url2'] = url  # relies on the module-level url set in the loop at the bottom of the script

    item['info5'] = info5

    item['info6'] = info6

    item['info7'] = info7

    item['info10'] = info10

    print('write item.............................................')

    print(item)

    # dbname = 'brcardsdata'

    # dbtablename = 'data_source10004_url'

    # updateToDatabase(dbname, dbtablename, item)

    print('write item.................................................')

def get_key_url_map(dbname, tablename):

    conn = getDbConn(dbname)

    cursor = conn.cursor()

    print("mysql connect success")

    sql = "select url,pagecode from " + tablename

    cursor.execute(sql)

    dataresult = cursor.fetchall()

    conn.close()

    return dataresult

def scrapyStart1(url, pagecode):

    htmlContent = loadpage('10004', pagecode)

    parse_data_page_step1(htmlContent, pagecode)

    pagecode2 = pagecode + '_nextpage'

    htmlContent = loadpage('10004', pagecode2)

    parse_data_page_step2(htmlContent, pagecode2)

# key_url_map = get_key_url_map('pagedata', 'data_source10004_url')

key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]

for key_url in key_url_map:

    url = key_url[0]

    pagecode = key_url[1]

    pagecode = str(pagecode)

    print(url)

    scrapyStart1(url, pagecode)
