Python Crawler: Scraping Company Homepage Title, Keywords, and Description

2019-12-29  PyGoZ

My work involves SEO optimization for the company website, but I wasn't clear on the Title, Keywords, and Description part. So I collected the corresponding content from websites of companies in the same line of business as ours, ran a word-frequency analysis on it to pick out candidate keywords, then combined those with keywords from our own business to produce the final Keywords for the site's SEO.

Crawler engineer

Without further ado: tutorials on the crawling techniques used here were covered in earlier posts, examples included, so this post mainly presents the code. Besides scraping the data, the program also analyzes the processed data to pick out the most frequent keywords, and saves the scraped data to an Excel file.
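
The frequency-analysis step boils down to segmenting the collected keyword strings with jieba and counting the tokens. Here is a minimal standalone sketch of that step, with a made-up sample string standing in for the scraped keywords:

```python
# -*- coding: utf-8 -*-
# A minimal sketch of the frequency-analysis step; the sample keyword
# string is made up for illustration.
import jieba

keywords_src = u"国际物流,海外仓,国际物流,货运代理"
counts = dict()
for word in jieba.lcut(keywords_src, cut_all=False):
    if word in (u',', u' '):        # skip separators
        continue
    counts[word] = counts.get(word, 0) + 1
# print the most frequent tokens first
for word, num in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print "%s: %s" % (word.encode('utf-8'), num)
```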

> Approach

Fetch data -> Parse -> Store -> Analyze
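
The fetch and parse steps are a plain requests GET plus BeautifulSoup on the page `<head>`. A minimal sketch against one of the target sites listed further down (the full program below adds a status-code check and the description field):

```python
# A minimal fetch-and-parse sketch; the URL is one of the targets
# listed in the full program below.
import requests
from bs4 import BeautifulSoup

resp = requests.get("http://www.imlb2c.com/", timeout=10)
soup = BeautifulSoup(resp.content, 'html.parser')
title = soup.head.find('title')
keywords = soup.head.find('meta', attrs={'name': 'keywords'})
print 'Title: %s' % (title.string if title else "")
print 'Keywords: %s' % (keywords.attrs.get('content') if keywords else "")
```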

> Highlights

This crawler uses gevent coroutines. If you write Python but aren't familiar with coroutines yet, go read up on them first.
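
For a quick picture before the full listing, here is a minimal sketch of the gevent pattern the crawler relies on; the two URLs are taken from the target list below:

```python
# -*- coding: utf-8 -*-
# monkey.patch_all() makes the blocking socket calls inside requests
# cooperative, so the spawned greenlets fetch both URLs concurrently.
from gevent import monkey; monkey.patch_all()

import gevent
import requests


def fetch(url):
    # each greenlet runs this; waiting on the HTTP response yields
    # control to the other greenlets instead of blocking
    return url, requests.get(url, timeout=10).status_code


jobs = [gevent.spawn(fetch, u) for u in
        ("http://www.imlb2c.com/", "http://www.wangjigroup.com/")]
gevent.joinall(jobs)
for job in jobs:
    url, status = job.value
    print "%s -> %s" % (url, status)
```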

> Code

```python
# -*- coding: utf-8 -*-

"""
------------------------------------------------
describe:
    Grab the homepage information of the specified logistics company
    websites, including title, keywords and description.
usage:
    python comp_infos_grab.py
base_info:
    __version__ = "v.10"
    __author__ = "PyGo"
    __time__ = "2019/12/3"
    __mail__ = "gaoming971366@163.com"
------------------------------------------------
"""

from gevent import monkey; monkey.patch_all()   # patch the stdlib before requests is imported

import gevent
import requests
import xlwt
from bs4 import BeautifulSoup
import jieba

PUBLIC_URL_LIST = {
    "IML俄罗斯海外仓": "http://www.imlb2c.com/",
    "旺集科技": "http://www.wangjigroup.com/",
    "黑龙江俄速通国际物流有限公司": "http://www.ruston.cc/",
    "AliExpress全球速卖通": "https://sell.aliexpress.com/zh/__pc/shipping/aliexpress_shipping.htm",
    "中外运集装箱运输有限公司": "http://www.sinolines.com/",
    "乐泰国际物流有限公司": "http://www.letaimzl.com/",
    "NOEL诺艾尔集团": "http://www.noelworld.com/",
    "慧承国际物流": "http://www.hcwuliu.com/",
    "满洲里新颖国际货运代理有限公司": "http://www.mzlxinying.com/",
    "运盟国际物流": "http://www.ym-trans.com/",
    "如易科技": "http://www.ruecom.cn/",
}


class companyGrap(object):

    _instance = None

    def __init__(self):
        super(companyGrap, self).__init__()

    def __new__(cls, *args, **kwargs):
        # simple singleton: always hand back the same instance
        if companyGrap._instance is None:
            companyGrap._instance = object.__new__(cls)
        return companyGrap._instance

    def _get_infos(self, url):
        # fetch one homepage and pull title / keywords / description
        # out of its <head>
        results = dict()
        results['url'] = url
        if not url:
            return results
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/78.0.3904.108 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            head = soup.head
            titles = head.find_all('title')
            results['title'] = titles[0].string if titles else ""
            keywords = head.find_all('meta', attrs={'name': 'keywords'})
            results['keyword'] = keywords[0].attrs.get('content') if keywords else ""
            descriptions = head.find_all('meta', attrs={'name': 'description'})
            results['description'] = descriptions[0].attrs.get('content') if descriptions else ""
        return results

    def to_excel(self, datas, exlname):
        """
        generate data of excel format to save
        :param datas: excel data
        :param exlname: excel name
        :return: None, excel data
        """
        f = xlwt.Workbook(encoding='utf-8')
        sheet = f.add_sheet('sheet', cell_overwrite_ok=True)
        EXCEL_TITLES = ["ID", "NAME", "URL", 'TITLE', 'KEYWORDS', 'DESCRIPTION', "REMARK"]
        BUSINESS = "BUSINESS"
        style_title = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        font.colour_index = 4
        font.height = 220
        style_title.font = font
        style_content = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = False
        font.colour_index = 4
        font.height = 220
        style_content.font = font
        # header row
        for i in range(0, len(EXCEL_TITLES)):
            sheet.write(0, i, EXCEL_TITLES[i], style_title)
        # merge cells and rewrite the two-row header
        sheet.write_merge(0, 0, 3, 5, BUSINESS, style_title)
        sheet.write_merge(0, 1, 0, 0, 'ID', style_title)
        sheet.write_merge(0, 1, 1, 1, 'NAME', style_title)
        sheet.write_merge(0, 1, 2, 2, 'URL', style_title)
        sheet.write_merge(0, 1, 6, 6, 'REMARK', style_title)
        for i in range(3, 6):
            sheet.write(1, i, EXCEL_TITLES[i], style_content)
        row = 2
        count = 1
        for line in datas:
            sheet.write(row, 0, count, style_title)
            sheet.write(row, 1, line.get('name'), style_content)
            sheet.write(row, 2, line.get('url'), style_content)
            sheet.write(row, 3, line.get('title'), style_content)
            sheet.write(row, 4, line.get('keyword'), style_content)
            sheet.write(row, 5, line.get('description'), style_content)
            row += 1
            count += 1
        f.save(exlname)

    def _deal_url(self, k, v):
        # k (the company name) is unused here; kept for the spawn signature
        return self._get_infos(v)

    def to_generate_kw(self, datas):
        # concatenate all scraped keyword strings, segment them with
        # jieba and print the tokens ordered by frequency
        keywords_src = ""
        for data in datas:
            if not data:
                continue
            keywords_src += data.get('keyword') or ""
        keywords = jieba.lcut(keywords_src, cut_all=False)
        counts = dict()
        for word in keywords:
            if not word:
                continue
            if isinstance(word, unicode):
                word = word.encode('utf-8')
            if word in ('|', ',', ' ', '-', '，'):
                continue
            counts[word] = counts.get(word, 0) + 1
        ord_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        for k in ord_counts:
            print "%s: %s" % (k[0], k[1])

    def run(self, to_excel=False):
        """
        process run
        :param to_excel: also write the results to an excel file
        :return:
        """
        jobs = list()
        names = list()
        excel_datas = list()
        for k, v in PUBLIC_URL_LIST.iteritems():
            if not k or not v:
                continue
            names.append(k)
            jobs.append(gevent.spawn(self._deal_url, k, v))
        gevent.joinall(jobs)
        for name, job in zip(names, jobs):
            value = job.value
            print '==================%s==================' % name
            print 'Title: %s' % value.get('title')
            print 'Keyword: %s' % value.get('keyword')
            print 'Description: %s' % value.get('description')
            value['name'] = name
            excel_datas.append(value)
        self.to_generate_kw(excel_datas)
        if to_excel:
            print '---------excel ok'
            self.to_excel(excel_datas, 'companys.xls')


if __name__ == '__main__':
    companyGrap().run(to_excel=False)
```
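
Note that the script targets Python 2 (print statements, `dict.iteritems()`, the `unicode` type), so run it with a Python 2 interpreter. By default `run(to_excel=False)` only prints the per-site results and the keyword frequencies; change the last line to `companyGrap().run(to_excel=True)` to also write `companys.xls`.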
