内涵段子、糗事百科网页抓取分析

2021-12-06  本文已影响0人  海阔天空的博客

简介:

最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 V8 做了一个脚本引擎,用 C++ 实现了笑话的统计和发布功能,用 JS 实现了网页爬取分析的功能。这样 C++ 调用 V8 引擎,加载 JS 脚本,就会爬取一系列的内容。

以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。

内涵段子网页抓取分析代码:

// NeiHan DuanZi scraper: script-wide settings and result holder.
// Site: http://neihanshequ.com/

// Page requested by getJoyContextList in its "get web context" step.
var webUrl = 'http://neihanshequ.com/';
// Page requested by getJoyContextList in its "get image" step.
var imageUrl = 'http://neihanshequ.com/pic/';

// NOTE(review): neither value is read by the functions of this script
// (getJoyFromOnePage declares its own local endIndex) - confirm whether the
// host uses them before removing.
var index = 1;
var endIndex = 5;

// Result object: getJoyFromOnePage pushes records into `items`, and
// getJoyContextList sets `success` and returns the object as JSON text.
var retVal = { success: false, items: [] };
 
/**
 * Extracts every joke record from one page of neihanshequ.com HTML and
 * appends each record to the global `retVal.items`.
 *
 * The page is scanned strictly left to right: a field is located by a literal
 * marker, and everything up to and including that marker is discarded before
 * the next field is searched for. A record starts at the `data-group-id="`
 * marker; scanning stops as soon as that marker can no longer be found.
 *
 * @param {string} htmlData       Raw HTML of the page; null/undefined is treated as an empty page.
 * @param {Object} requestParams  Not used here; kept so existing callers need no change.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData, requestParams)
{
    // Unscanned tail of the page; it shrinks every time a marker is consumed.
    var remaining = (htmlData === null || htmlData === undefined) ? '' : String(htmlData);

    // Drops everything up to and including `marker`; returns false when the marker is absent.
    function skipPast(marker)
    {
        var markerPos = remaining.indexOf(marker);
        // indexOf reports "not found" as -1; offset 0 is a valid hit.
        if( markerPos < 0 )
        {
            return false;
        }
        remaining = remaining.substring(markerPos + marker.length);
        return true;
    }

    // Returns the text in front of `terminator` ('' when the terminator is missing).
    function readUntil(terminator)
    {
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while( true )
    {
        var result =
        {
            webname: 'NeiHanDuanzi',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // Skip ahead when the next "is_gif" flag on the page is 1.
        // NOTE(review): this consumes the next flag wherever it is on the page;
        // whether every entry carries one depends on the site markup - confirm.
        if( skipPast('"is_gif":"') && Number(readUntil('"')) === 1 )
        {
            // pic_url has not been parsed yet at this point, so it prints empty.
            alert('NeiHanDuanzi:url is a gif:' + result.pic_url);
            continue;
        }

        // webid - its absence means the page is exhausted. Stop here so that
        // no half-filled record with an empty webid is stored.
        if( !skipPast('data-group-id="') )
        {
            alert('NeiHanDuanzi:webid not find, page end.');
            break;
        }
        result.webid = readUntil('"');
        alert('NeiHanDuanzi:webid-' + result.webid);

        // read_count (text of the "digg" span)
        if( skipPast('<span class="digg">') )
        {
            result.read_count = readUntil('</span>');
        }

        // context (joke text)
        if( skipPast('data-text="') )
        {
            result.context = readUntil('"');
        }

        // pic_url
        if( skipPast('data-pic="') )
        {
            result.pic_url = readUntil('"');
        }

        retVal.items.push(result);
    }

    return;
}
 
/**
 * Entry point of the NeiHan DuanZi scraper. Requests the page at `webUrl` and
 * then the page at `imageUrl`, hands each response body to getJoyFromOnePage
 * (which fills `retVal.items`), and returns `retVal` as JSON text.
 *
 * @param {string} url               Not used; the pages come from the globals webUrl / imageUrl.
 * @param {string} parametersString  Object-literal text; its value is forwarded to the host
 *                                   as `requestParams.scriptParamaters`.
 * @returns {string} JSON text of `retVal`; `success` is true only when both requests
 *                   answered with status 200.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval executes whatever script text it is given. It is kept
    // because the exact format the host passes in is not visible from here;
    // switch to JSON.parse once the input is confirmed to be strict JSON.
    var parameters = eval("(" + parametersString + ")");
    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {},
        // Key spelling is part of the contract with the host - do not "fix" it.
        scriptParamaters: parameters
    };

    var headers = requestParams.headers;
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';
    headers['Accept-Encoding'] = 'gzip, deflate, sdch';
    headers['Accept-Language'] = 'zh-CN,zh;q=0.8,en;q=0.6';
    headers['Cache-Control'] = 'no-cache';
    headers['Connection'] = 'keep-alive';
    // The percent signs of the url-encoded utmctr value had been replaced by a
    // "{b75a...}" placeholder token; they are restored here.
    // NOTE(review): these are hard-coded session values - presumably copied
    // from a browser session; confirm they are still wanted.
    headers['Cookie'] = 'uuid="w:0ef44d961a6d43c99dd81ecb51596731"; sessionid=57f633c63c5de5d0bc03cddb0c6ee166; tt_webid=5286193655; __utmt=1; csrftoken=d760789fbe1fc31edae4ac6c11c5a700; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782; __utma=101886750.2017161997.1438825217.1440995644.1440996740.6; __utmb=101886750.5.10.1440996740; __utmc=101886750; __utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA';
    headers['Host'] = 'neihanshequ.com';
    headers['Pragma'] = 'no-cache';
    headers['Referer'] = 'http://neihanshequ.com/';
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11';

    // Requests one page and feeds its body to the parser; returns false when the request failed.
    // `label` is the variable name printed in the failure message.
    function scrapePage(label, pageUrl)
    {
        var httpRspString = syncHttpRequest(pageUrl, JSON.stringify(requestParams));
        // NOTE(review): eval on a string that embeds remote page content - see the note above.
        var httpRsp = eval("(" + httpRspString + ")");
        // Loose comparison on purpose: the type of statusCode returned by the host is not visible here.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('NeiHanDuanzi: Request ' + label + '(' + pageUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return false;
        }

        getJoyFromOnePage(httpRsp.data, requestParams);
        return true;
    }

    // The second page is only requested when the first one succeeded.
    if( !scrapePage('webUrl', webUrl) || !scrapePage('imageUrl', imageUrl) )
    {
        return JSON.stringify(retVal);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}

糗事百科网页抓取分析代码:

// QiuShiBaiKe scraper: script-wide settings and result holder.
// Page URL pattern: http://www.qiushibaike.com/hot/page/{index}

// Base URL; getJoyContextList appends the page number to it.
var webUrl = 'http://www.qiushibaike.com/hot/page/';

// Page range walked by getJoyContextList (its loop runs while index <= endIndex).
var index = 1;
var endIndex = 5;

// Result object: getJoyFromOnePage pushes records into `items`, and
// getJoyContextList sets `success` and returns the object as JSON text.
var retVal = { success: false, items: [] };
 
/**
 * Extracts every joke record from one page of qiushibaike.com HTML and
 * appends each record to the global `retVal.items`.
 *
 * The page is scanned strictly left to right: a field is located by a literal
 * marker, and everything up to and including that marker is discarded before
 * the next field is searched for. A record starts at the `qiushi_tag_`
 * marker; scanning stops as soon as that marker can no longer be found.
 *
 * @param {string} htmlData  Raw HTML of the page; null/undefined is treated as an empty page.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData)
{
    // Unscanned tail of the page; it shrinks every time a marker is consumed.
    var remaining = (htmlData === null || htmlData === undefined) ? '' : String(htmlData);

    // Drops everything up to and including `marker`; returns false when the marker is absent.
    function skipPast(marker)
    {
        var markerPos = remaining.indexOf(marker);
        // indexOf reports "not found" as -1; offset 0 is a valid hit (for
        // example an <img> tag that directly follows the article anchor).
        if( markerPos < 0 )
        {
            return false;
        }
        remaining = remaining.substring(markerPos + marker.length);
        return true;
    }

    // Returns the text in front of `terminator` ('' when the terminator is missing).
    function readUntil(terminator)
    {
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while( true )
    {
        var result =
        {
            webname: 'QiuShiBaiKe',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // webid - its absence means the page is exhausted. Stop here so that
        // no half-filled record with an empty webid is stored.
        if( !skipPast('qiushi_tag_') )
        {
            alert('QiuShiBaiKe:webid not find, page end.');
            break;
        }
        result.webid = readUntil("'>");
        alert('QiuShiBaiKe:webid-' + result.webid);

        // context (joke text, up to the next "<!" sequence)
        if( skipPast('<div class="content">') )
        {
            result.context = readUntil('<!');
        }

        // pic_url - first <img> after the anchor that links to this article.
        // NOTE(review): once the anchor is found, the <img> search covers the
        // whole rest of the page, not just this entry - confirm against the markup.
        if( skipPast('<a href="/article/' + result.webid + '" target="_blank">') && skipPast('<img src="') )
        {
            result.pic_url = readUntil('" alt="');
        }

        // read_count (text of the vote counter)
        if( skipPast('<span class="stats-vote"><i class="number">') )
        {
            result.read_count = readUntil('</i>');
        }

        retVal.items.push(result);
    }

    return;
}
 
/**
 * Entry point of the QiuShiBaiKe scraper. Requests the pages `webUrl + n` for
 * n from the global `index` up to and including the global `endIndex`, hands
 * each response body to getJoyFromOnePage (which fills `retVal.items`), and
 * returns `retVal` as JSON text.
 *
 * @param {string} url               Not used; page URLs are built from the global webUrl.
 * @param {string} parametersString  Object-literal text; its value is forwarded to the host
 *                                   as `requestParams.scriptParamaters`.
 * @returns {string} JSON text of `retVal`; `success` is true only when every request
 *                   answered with status 200. The first failing request ends the run.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval executes whatever script text it is given. It is kept
    // because the exact format the host passes in is not visible from here;
    // switch to JSON.parse once the input is confirmed to be strict JSON.
    var parameters = eval("(" + parametersString + ")");
    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {},
        // Key spelling is part of the contract with the host - do not "fix" it.
        scriptParamaters: parameters
    };
    requestParams.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11';
    requestParams.headers['Host'] = 'www.qiushibaike.com';
    requestParams.headers['Connection'] = 'keep-alive';
    requestParams.headers['Accept-Encoding'] = 'gzip, deflate, sdch';
    requestParams.headers['Accept-Language'] = 'zh-CN,zh;q=0.8,en;q=0.6';
    requestParams.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';

    // Walk the page range with a local counter. Incrementing the global
    // `index` itself would leave it past `endIndex`, so a second call would
    // request nothing and still report success.
    for(var page = index; page <= endIndex; page++)
    {
        var trueUrl = webUrl + page;
        var httpRspString = syncHttpRequest(trueUrl, JSON.stringify(requestParams));
        // NOTE(review): eval on a string that embeds remote page content - see the note above.
        var httpRsp = eval("(" + httpRspString + ")");
        // Loose comparison on purpose: the type of statusCode returned by the host is not visible here.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}

本文摘录于海阔天空的博客,作者: zjg555543,发布时间: 2015-09-17

上一篇下一篇

猜你喜欢

热点阅读