Scraping GitHub Frameworks with Python

2019-05-13 · cyh老崔

Preface

When building iOS duplicate ("马甲") apps, obfuscation code has to be added, and part of mine comes from GitHub. In some spare time I wrote a script for this, built on the Scrapy framework.

1. The Item model

import scrapy


class GithubItem(scrapy.Item):
    # Define the fields for the item:
    name = scrapy.Field()         # framework name
    url = scrapy.Field()          # URL of the framework's home page
    star_number = scrapy.Field()  # number of stars
    update_time = scrapy.Field()  # last update time
    clone_url = scrapy.Field()    # git clone URL
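For reference, a scrapy.Item subclass behaves like a dict, so these fields can be exercised straight from a Python shell. A minimal sketch (assuming the project package is named github, as the import in the next section suggests):

from github.items import GithubItem

item = GithubItem()
item['name'] = 'AFNetworking'  # fields use dict-style access
item['star_number'] = '32.4k'
print(dict(item))  # {'name': 'AFNetworking', 'star_number': '32.4k'}
# Assigning to an undeclared field raises KeyError, which guards against typos.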

2. The spider: scraping frameworks that match the search criteria

from github.items import GithubItem
import scrapy


class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    url = 'https://github.com/search?l=Objective-C&o=desc&q=ios&s=stars&p='
    offset = 1
    start_urls = [
        url+str(offset)
    ]
    # All scraped items
    items = []
    # Collected clone URLs
    clone_urls = []

    def parse(self, response):
        names = response.xpath('//ul[@class="repo-list"]/li/div/h3/a/text()').extract()
        # These hrefs are relative; the full URL needs the https://github.com prefix.
        urls = response.xpath('//ul[@class="repo-list"]/li/div/h3/a/@href').extract()
        star_numbers = response.xpath('//ul[@class="repo-list"]/li/div//a[@class="muted-link"]/text()').extract()

        # Strip whitespace/newlines; drop entries that are empty after stripping.
        for i in range(len(star_numbers) - 1, -1, -1):
            temp = star_numbers[i].strip()
            if temp:
                star_numbers[i] = temp
            else:
                del star_numbers[i]

        update_times = response.xpath('//ul[@class="repo-list"]/li/div//p[@class="f6 text-gray mr-3 mb-0 mt-2"]/relative-time/text()').extract()
        for i in range(len(names)):
            item = GithubItem()

            item['name'] = names[i]
            item['url'] = 'https://github.com' + urls[i]
            item['star_number'] = star_numbers[i]
            item['update_time'] = update_times[i]
            self.items.append(item)
            # Visit the framework's home page to pick up its clone URL.
            yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse_article)

        # After each page is processed, request the next page (up to page 3).
        if self.offset < 3:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_article(self, response):
        """
        Parse a framework's home page and extract the clone URL.
        :param response:
        :return:
        """
        item = response.meta['item']
        item['clone_url'] = response.xpath('//div[@class="input-group"]/input/@value').extract_first()
        self.clone_urls.append(item['clone_url'])
        print('clone_url: %s' % self.clone_urls)
        yield item

Note: to control how many pages are fetched, change the 3 in this condition:

        if self.offset < 3:
            self.offset += 1
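The scraped star counts remain display strings such as '1.2k'. If numeric values are needed (say, to filter by popularity), a small helper can normalize them. This is a sketch of one possible approach, not part of the original spider; parse_star_count is a hypothetical name:

def parse_star_count(text):
    """Convert a GitHub star string such as '1.2k' or '348' to an int.

    Hypothetical helper; not part of the original spider.
    """
    text = text.strip().replace(',', '')
    if text.lower().endswith('k'):
        return int(float(text[:-1]) * 1000)
    return int(text)

# parse_star_count('32.4k') -> 32400
# parse_star_count('348')   -> 348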

3. Cloning to a local path

import os
import pwd
import shutil

class GitUtil(object):

    @classmethod
    def yh_clone_url(cls, url):

        # Work out the destination path, replacing any stale copy.
        dst_path = cls._dst_path_of_clone(url)
        if os.path.exists(dst_path):
            shutil.rmtree(dst_path)

        # Run the shell command; git clone creates dst_path itself.
        cls._clone_url_to_dst_path(url, dst_path)
        output = os.popen("pwd")
        print(output.read())

    @classmethod
    def _clone_url_to_dst_path(cls, url, dst_path):
        """
        Clone into the destination path.
        :param url: the repository to clone
        :param dst_path: the destination path
        :return:
        """
        os.system("git clone " + url + " " + dst_path)

    @classmethod
    def _dst_path_of_clone(cls, url):
        # Derive the framework name from the clone URL.
        last_path = os.path.split(url)[-1]
        framework_name = os.path.splitext(last_path)[0]

        home_path = pwd.getpwuid(os.getuid()).pw_dir
        dst_folder_path = os.path.join(home_path, 'Desktop/tmp_git')
        # Create the folder once; wiping it here would delete every
        # repository cloned earlier in the same run.
        if not os.path.exists(dst_folder_path):
            os.mkdir(dst_folder_path)
        dst_path = os.path.join(dst_folder_path, framework_name)

        return dst_path
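Since parse_article yields each item once its clone URL is known, GitUtil can be hooked into the crawl through a Scrapy item pipeline. The sketch below is one way to wire this up and is not in the original repo; the module path github.git_util and the class name CloneFrameworkPipeline are assumptions:

# pipelines.py -- hypothetical glue code, assuming GitUtil lives in github/git_util.py
from github.git_util import GitUtil


class CloneFrameworkPipeline(object):

    def process_item(self, item, spider):
        # Clone each repository as soon as its clone URL has been scraped.
        if item.get('clone_url'):
            GitUtil.yh_clone_url(item['clone_url'])
        return item

It would then be enabled in settings.py with ITEM_PIPELINES = {'github.pipelines.CloneFrameworkPipeline': 300}. With the folder logic above, each framework lands in its own subdirectory, e.g. cloning https://github.com/AFNetworking/AFNetworking.git produces ~/Desktop/tmp_git/AFNetworking.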

4. main.py: running the script

from scrapy import cmdline
# Equivalent to running 'scrapy crawl github' in the project directory.
cmdline.execute('scrapy crawl github'.split())
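Alternatively, the spider can be driven through Scrapy's Python API instead of shelling out to the CLI; a minimal equivalent sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py and run the 'github' spider in-process.
process = CrawlerProcess(get_project_settings())
process.crawl('github')
process.start()  # blocks until the crawl finishes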

Source code: https://github.com/cuiYuhe/crawlFrameworks
