Go 爬虫：使用 goroutine 和 channel 并发爬取百度首页新闻

2018-02-10  本文已影响0人  贼噶人
2018-02-10 12-38-40 的屏幕截图.png
 package main

import (
    "runtime"
    "sync"
    "bytes"
    "strconv"
    "strings"
    "net/http"
    "io/ioutil"
    "github.com/opesun/goquery"
    "fmt"
    _ "github.com/mattn/go-sqlite3"
    "database/sql"
    "os"
)


// news is one scraped item. getBaiduNews fills title from an anchor's
// "data-title" attribute and url from its "data-link" attribute; saveNewsDB
// writes both into the news table.
type news struct {
	title, url string
}



// init performs no work; it is an empty placeholder.
func init() {}

// saveNewsDB stores every crawled news item in the SQLite database "new.db".
//
// It opens the database, creates the news table if it does not exist yet, and
// then inserts one row per item received from newsChan until that channel is
// closed. Any database error panics, as in the original implementation.
//
// wSaveOk.Done is deferred first so that it runs last: the waiter is released
// only after the prepared statement and the database handle have been closed.
func saveNewsDB(newsChan chan news, wSaveOk *sync.WaitGroup) {
	defer wSaveOk.Done()

	db, err := sql.Open("sqlite3", "new.db")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	if _, err = db.Exec("CREATE TABLE IF NOT EXISTS news(_id INTEGER PRIMARY KEY,title TEXT NOT NULL,url TEXT NOT NULL);"); err != nil {
		panic(err)
	}

	// The INSERT is identical for every row, so prepare it once and reuse it
	// instead of preparing and closing a statement per item.
	stmt, err := db.Prepare("INSERT INTO news (title,url) VALUES (?,?)")
	if err != nil {
		panic(err)
	}
	defer stmt.Close()

	for e := range newsChan {
		if _, err := stmt.Exec(e.title, e.url); err != nil {
			panic(err)
		}
	}
}
// count is a package-level int64 that no function in the visible source reads
// or writes; main declares its own local variable named count, which shadows
// this one inside main.
var count int64

// getBaiduNews is a crawl worker. It receives page offsets from offset until
// that channel is closed, fetches the matching page of the Baidu home-page
// news feed, and sends every item it extracts to newsChan. wGetOk.Done is
// called exactly once, when the worker returns.
//
// A page that cannot be fetched or parsed is reported on stderr and skipped;
// the worker then continues with the next offset.
func getBaiduNews(offset chan int, newsChan chan news, wGetOk *sync.WaitGroup) {
	defer wGetOk.Done()
	for index := range offset {
		if err := fetchBaiduNewsPage(index, newsChan); err != nil {
			fmt.Fprintln(os.Stderr, "baidu news offset", index, "skipped:", err)
		}
	}
}

// fetchBaiduNewsPage downloads the feed page at the given offset, extracts the
// embedded HTML fragment, and sends one news value per "a.s-title-yahei"
// anchor to newsChan, printing each item after it has been sent.
func fetchBaiduNewsPage(index int, newsChan chan news) error {
	var target bytes.Buffer
	target.WriteString("https://www.baidu.com/home/pcweb/data/mancardwater?id=2&offset=")
	target.WriteString(strconv.Itoa(index))
	target.WriteString("&sessionId=15180565112719&crids=&version=&pos=52&newsNum=52&blacklist_timestamp=0&indextype=manht&_req_seqid=0xab0aac7f0000ef5d&asyn=1&t=1518056583617&sid=1428_21082_20719")

	payload := strings.NewReader("params=lKyJiYt65SeTS%252BaO4ZqdonwEyY%252BSzr7oFr8kEpP8j0H%252FqMMPELQv9UibydyQRz10kcdSQHQmgvca2yvKfGSX5R%252BU8ByLp6rS4CRiH%252B%252FYxok%253D%26&encSecKey=1dd9fd1745dae8af1c0a678baf62803becdabb6685b9cf756ce101accb9daa291f408a848e84d83a344fe98db6d3ea2abf63a278f98191c4234bd201a5bbfba1faadc509bacde313e693ecf0aceace909b5e8a168be20e34b0eef3640a45b075a6b4c1ff581cea91debaa69d125326e218d09bb01cc490ad09fe5c1d24746047")
	req, err := http.NewRequest("GET", target.String(), payload)
	if err != nil {
		return err
	}
	req.Header.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
	// NOTE(review): setting Accept-Encoding explicitly turns off net/http's
	// transparent gzip decoding. If the server answers with a compressed body,
	// the marker lookup below will not match and the page is skipped. Kept
	// unchanged from the captured request; confirm whether it should be removed.
	req.Header.Add("accept-encoding", "gzip, deflate, br")
	req.Header.Add("accept-language", "zh-CN,zh;q=0.9,en;q=0.8")
	req.Header.Add("cache-control", "no-cache")
	req.Header.Add("connection", "keep-alive")
	req.Header.Add("cookie", "BDUSS=WoxNkZQYXdTfkdjWUd3YUVDNTIwTkV-NlpRRTYtbWhPOUEwZ356OGdyY0h4ak5aSVFBQUFBJCQAAAAAAAAAAAEAAACtM2Q2am92ZXpnAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAc5DFkHOQxZd; __cfduid=dd4f0bca4c50d4c7833592743d9934d741500540178; BAIDUID=43DFC2B31E07D893EC68CC4C31A5C7DC:FG=1; PSTM=1504776284; BIDUPSID=4F2127EE6005F11465294EE835520DE7; BD_UPN=123353; pgv_pvi=5320228864; BDRCVFR[e7VUaW6Ywr3]=aeXf-1x8UdYcs; BD_HOME=1; BD_CK_SAM=1; BDRCVFR[Oi7iajNidCC]=9xWipS8B-FspA7EnHc1QhPEUf; PSINO=5; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1428_21082_20719; sug=3; sugstore=1; ORIGIN=2; bdime=21110")
	req.Header.Add("host", "www.baidu.com")
	req.Header.Add("upgrade-insecure-requests", "1")
	req.Header.Add("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.119 Chrome/64.0.3282.119 Safari/537.36")
	req.Header.Add("postman-token", "0b307632-c6ae-bb88-6a78-cb196aa43b4e")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil here, so it must not be touched. The original closed
		// res.Body unconditionally and crashed on any transport error.
		return err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %s", res.Status)
	}
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return err
	}

	fragment, ok := extractNewsHTML(string(body))
	if !ok {
		return fmt.Errorf("response does not contain the expected news html field")
	}
	nodes, err := goquery.Parse(strings.NewReader(fragment))
	if err != nil {
		return err
	}
	nodes.Find("a.s-title-yahei").Each(func(_ int, element *goquery.Node) {
		var item news
		for _, attr := range element.Node.Attr {
			switch attr.Key {
			case "data-title":
				item.title = attr.Val
			case "data-link":
				item.url = attr.Val
			}
		}
		newsChan <- item
		fmt.Println(item)
	})
	return nil
}

// extractNewsHTML returns the text that follows the `{"errNo": "0","html" : "`
// marker, considering only the part of body before the first
// `","isEnd": "0",` marker. ok is false when the prefix marker is missing, a
// case in which the original code indexed past the end of the split result.
func extractNewsHTML(body string) (fragment string, ok bool) {
	head := strings.Split(body, "\",\"isEnd\": \"0\",")[0]
	parts := strings.Split(head, "{\"errNo\": \"0\",\"html\" : \"")
	if len(parts) < 2 {
		return "", false
	}
	return parts[1], true
}

// main wires the crawl pipeline together.
//
// It prints os.Args, then reads an optional upper bound from os.Args[1]
// (parsed with base auto-detection; a parse failure panics). A missing or zero
// value falls back to 10. One getBaiduNews worker per CPU consumes offsets
// 1 through count-1, a single saveNewsDB goroutine persists the results, and
// main returns once the saver has finished.
func main() {
	fmt.Println(os.Args)

	count := 0
	if len(os.Args) > 1 {
		parsed, err := strconv.ParseInt(os.Args[1], 0, 64)
		if err != nil {
			panic(err)
		}
		count = int(parsed)
	}
	if count == 0 {
		count = 10
	}

	offsets := make(chan int)
	found := make(chan news, 100)
	workers := runtime.NumCPU()

	// Start the fetch workers.
	var wGetOk sync.WaitGroup
	wGetOk.Add(workers)
	for w := 0; w < workers; w++ {
		go getBaiduNews(offsets, found, &wGetOk)
	}

	// Start the single database writer.
	var wSaveOk sync.WaitGroup
	wSaveOk.Add(1)
	go saveNewsDB(found, &wSaveOk)

	// Close the results channel once every worker is done, which lets the
	// writer's receive loop end.
	go func() {
		wGetOk.Wait()
		close(found)
	}()

	// Feed the offsets, then close the channel so the workers stop.
	go func() {
		for i := 1; i < count; i++ {
			offsets <- i
		}
		close(offsets)
	}()

	wSaveOk.Wait()
}

上一篇 下一篇

猜你喜欢

热点阅读