Go Language Basics 08: HTTP Programming

2019-03-20  Spring618

Analyzing the request message format

package main

import (
    "fmt"
    "net"
)

func main() {
    fmt.Println("Http请求包格式演示案例")
    listener, err := net.Listen("tcp", "127.0.0.1:8000") //监听
    if err != nil {
        fmt.Println("err:", err)
        return
    }

    defer listener.Close() // close the listener when main returns

    for {
        // block until a client connects
        conn, err := listener.Accept()
        if err != nil {
            fmt.Println("err:", err)
            return
        }
        // read the raw request bytes the client sent
        buff := make([]byte, 1024)
        n, err1 := conn.Read(buff)
        if err1 != nil {
            fmt.Println("err1:", err1)
            conn.Close()
            continue
        }

        fmt.Println("buff = ", string(buff[:n]))
        conn.Close() // close this connection now; defer inside a loop would pile up until main exits
    }

}

Visit http://127.0.0.1:8000/ in a browser.
Console output:

GET / HTTP/1.1
Host: 127.0.0.1:8000
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
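
The dump shows the three parts of a request message: the request line (method, path, protocol version), the header lines, and an empty line that terminates the header section (a request body would follow it for methods like POST). Because the server above never writes anything back, the browser eventually gives up. As a sketch, the accept loop could answer with a hand-assembled response message; these lines would go right after the buff print and are illustrative, not part of the original code:

// Reply on the raw TCP connection with a hand-built response message:
// status line, headers, a blank line, then the body.
response := "HTTP/1.1 200 OK\r\n" +
    "Content-Type: text/plain; charset=utf-8\r\n" +
    "Content-Length: 5\r\n" +
    "\r\n" +
    "hello"
conn.Write([]byte(response))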

Testing the response message format

package main

import (
    "fmt"
    "net/http"
)

func myHandler(w http.ResponseWriter, r *http.Request) {
    w.Write([]byte("hello go!")) // write the body; net/http supplies the status line and default headers
}

func main() {
    fmt.Println("Response message demo")
    http.HandleFunc("/go", myHandler) // register myHandler for the /go path
    // listen on the given address; ListenAndServe only returns on error
    if err := http.ListenAndServe("127.0.0.1:8000", nil); err != nil {
        fmt.Println("err:", err)
    }
}

Visit http://127.0.0.1:8000/go in a browser.
The page shows: hello go!
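
To see how the parts of the response message map onto the API, a handler can also set the header section and status line explicitly before writing the body. A small sketch (myHeaderHandler is a made-up name for illustration):

// Sketch: shape each part of the response message explicitly.
func myHeaderHandler(w http.ResponseWriter, r *http.Request) {
    w.Header().Set("Content-Type", "text/plain; charset=utf-8") // header section
    w.WriteHeader(http.StatusOK)                                // status line: HTTP/1.1 200 OK
    w.Write([]byte("hello go!"))                                // body
}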

HTTP client programming

Fetching Baidu's homepage

package main

import (
    "fmt"
    "net/http"
)

func main() {
    fmt.Println("http编程演示案例")
    resp, err := http.Get("https://www.baidu.com") //func Get(url string) (resp *Response, err error)
    if err != nil {
        fmt.Printf("err:", err)
        return
    }
    defer resp.Body.Close()
    fmt.Println("Status = ", resp.Status)
    fmt.Println("StatusCode = ", resp.StatusCode)
    fmt.Println("Header = ", resp.Header)
    fmt.Println("Body = ", resp.Body)
    
}

Output:

Status =  200 OK
StatusCode =  200
Header =  map[Accept-Ranges:[bytes] Cache-Control:[no-cache] Connection:[Keep-Alive] Content-Length:[227] Content-Type:[text/html] Date:[Wed, 20 Mar 2019 11:10:52 GMT] Etag:["5c7cdb1f-e3"] Last-Modified:[Mon, 04 Mar 2019 08:00:31 GMT] P3p:[CP=" OTI DSP COR IVA OUR IND COM "] Pragma:[no-cache] Server:[BWS/1.1] Set-Cookie:[BD_NOT_HTTPS=1; path=/; Max-Age=300 BIDUPSID=1EB8D042488157FB56779477283469A8; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1553080252; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com] Strict-Transport-Security:[max-age=0] X-Ua-Compatible:[IE=Edge,chrome=1]]
Body =  &{0xc000034080 {0 0} false <nil> 0x60dd50 0x60dcd0}

A Baidu Tieba crawler

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "strconv"
)

func main() {
    fmt.Println("百度贴吧爬虫编程演示案例")
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
    var start, end int
    fmt.Printf("输入起始页:")
    fmt.Scan(&start)
    fmt.Printf("输入结束页:")
    fmt.Scan(&end)
    doWork(start, end)
}

func doWork(start, end int) {
    fmt.Printf("正在爬去 %d - %d 页的数据\n", start, end)
    for i := start; i <= end; i++ {
        getPage(i)
    }
}

func getPage(page int) {
    // pn counts 50 results per page; note that pn=0 is the first result page,
    // so page*50 actually starts from the second page
    url := "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
    fmt.Println("url = ", url)
    res, err := httpGet(url)
    if err != nil {
        fmt.Println("err:", err)
        return
    }
    // fmt.Println("res:", res)
    // 把内容写在文件里 1.html
    fileName := strconv.Itoa(page) + ".html"
    f1, err1 := os.Create(fileName)
    if err1 != nil {
        fmt.Println("err1:", err1)
        return
    }
    f1.WriteString(res)

}
func httpGet(url string) (res string, err error) {
    resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
    if err1 != nil {
        fmt.Println("err1:", err1)
        err = err1
        return
    }
    defer resp.Body.Close()

    buff := make([]byte, 1024)
    for {
        // Read may return n > 0 together with io.EOF on the last chunk,
        // so consume the bytes before checking the error; treating every
        // non-nil error as fatal here would drop that final chunk.
        n, err2 := resp.Body.Read(buff)
        if n > 0 {
            res += string(buff[:n])
        }
        if err2 == io.EOF {
            fmt.Println("done reading")
            break
        }
        if err2 != nil {
            err = err2
            return
        }
    }

    return
}
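
One note on httpGet: building res with += copies the accumulated string on every iteration. A sketch of an equivalent helper that does the whole read in one call (httpGetAll is a made-up name; ioutil.ReadAll becomes io.ReadAll in Go 1.16+):

// Sketch: same behavior as httpGet above, using ioutil.ReadAll.
func httpGetAll(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    data, err := ioutil.ReadAll(resp.Body) // reads until EOF; needs import "io/ioutil"
    if err != nil {
        return "", err
    }
    return string(data), nil
}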

A concurrent version of the crawler:

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "strconv"
)

var mPage = make(chan int) // each page reports its number here when done

func main() {
    fmt.Println("百度贴吧爬虫编程演示案例")
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
    var start, end int
    fmt.Printf("输入起始页:")
    fmt.Scan(&start)
    fmt.Printf("输入结束页:")
    fmt.Scan(&end)
    doWork(start, end)
}

func doWork(start, end int) {
    fmt.Printf("Crawling pages %d - %d\n", start, end)

    for i := start; i <= end; i++ {
        go getPage(i)
    }

    // receive exactly one completion message per page
    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-mPage)
    }
}

func getPage(page int) {
    // always report back, even on failure, so doWork never blocks forever
    defer func() { mPage <- page }()

    url := "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
    fmt.Printf("fetching page %d: %s\n", page, url)
    res, err := httpGet(url)
    if err != nil {
        fmt.Println("err:", err)
        return
    }

    // write the page contents to a file named <page>.html
    fileName := strconv.Itoa(page) + ".html"
    f1, err1 := os.Create(fileName)
    if err1 != nil {
        fmt.Println("err1:", err1)
        return
    }
    defer f1.Close()
    f1.WriteString(res)
}
func httpGet(url string) (res string, err error) {
    resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
    if err1 != nil {
        fmt.Println("err1:", err1)
        err = err1
        return
    }
    defer resp.Body.Close()

    buff := make([]byte, 1024)
    for {
        // Read may return n > 0 together with io.EOF on the last chunk,
        // so consume the bytes before checking the error.
        n, err2 := resp.Body.Read(buff)
        if n > 0 {
            res += string(buff[:n])
        }
        if err2 == io.EOF {
            break
        }
        if err2 != nil {
            err = err2
            return
        }
    }

    return
}
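
Here the channel serves double duty: it synchronizes main with the goroutines and reports progress. When only synchronization is needed, sync.WaitGroup is the more common idiom. A sketch (doWorkWG is a made-up name, and it assumes a getPage variant without the mPage send, since nothing would receive it):

// Sketch: wait for all pages with sync.WaitGroup instead of counting on a channel.
func doWorkWG(start, end int) {
    var wg sync.WaitGroup // needs import "sync"
    for i := start; i <= end; i++ {
        wg.Add(1)
        go func(page int) {
            defer wg.Done()
            getPage(page) // assumes a variant of getPage without the mPage send
        }(i)
    }
    wg.Wait() // blocks until every goroutine has called Done
}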

A joke crawler (pengfue.com)

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "strconv"
)

var mPage = make(chan int) // each page reports its number here when done

func main() {
    fmt.Println("段子爬虫编程演示案例")
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
    var start, end int

    // fmt.Printf("输入起始页:")
    // fmt.Scan(&start)
    // fmt.Printf("输入结束页:")
    // fmt.Scan(&end)
    start = 1
    end = 1

    doWork(start, end)
}

func doWork(start, end int) {
    fmt.Printf("Crawling pages %d - %d\n", start, end)

    for i := start; i <= end; i++ {
        go getPage(i)
    }

    // receive exactly one completion message per page
    for i := start; i <= end; i++ {
        fmt.Printf("page %d done\n", <-mPage)
    }
}

func getPage(page int) {
    // always report back, even on failure, so doWork never blocks forever
    defer func() { mPage <- page }()

    url := "https://www.pengfue.com/index_" + strconv.Itoa(page) + ".html"
    fmt.Printf("fetching page %d: %s\n", page, url)
    res, err := httpGet(url)
    if err != nil {
        fmt.Println("err:", err)
        return
    }

    // each list item links to a detail page, e.g.:
    // <a href="https://www.pengfue.com/content_1857662_1.html" target="_blank">...</a>
    // MustCompile panics on a bad pattern, so no nil check is needed
    reg := regexp.MustCompile(`<h1 class="dp-b"><a href="(?s:(.*?))"`)
    urls := reg.FindAllStringSubmatch(res, -1)

    fileTitle := make([]string, 0)
    fileContent := make([]string, 0)

    for k, v := range urls {
        fmt.Println("k,v:", k, v[1])
        res, err := httpGet(v[1])
        if err != nil {
            fmt.Println("err:", err)
            return
        }

        // extract the title
        regTitle := regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)
        titles := regTitle.FindAllStringSubmatch(res, 1)
        for _, title := range titles {
            fmt.Println("title:", title[1])
            fileTitle = append(fileTitle, title[1])
        }

        // extract the content
        regContent := regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev"`)
        contents := regContent.FindAllStringSubmatch(res, 1)
        for _, content := range contents {
            fmt.Println("content:", content[1])
            fileContent = append(fileContent, content[1])
        }
    }

    //fmt.Println("fileTitle = ", fileTitle)
    //fmt.Println("fileContent = ", fileContent)

    saveToFile(page, fileTitle, fileContent)

    // fmt.Println("res:", res)

    mPage <- page

}

func saveToFile(page int, fileTitle, fileContent []string) {
    // write titles and contents to a file named <page>.txt
    fileName := strconv.Itoa(page) + ".txt"
    f1, err1 := os.Create(fileName)
    if err1 != nil {
        fmt.Println("err1:", err1)
        return
    }

    defer f1.Close()

    n := len(fileTitle)
    for i := 0; i < n; i++ {
        f1.WriteString(fileTitle[i] + "\n")
        f1.WriteString(fileContent[i] + "\n")
        f1.WriteString("===================================================================\n")
    }

}

func httpGet(url string) (res string, err error) {
    resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
    if err1 != nil {
        fmt.Println("err1:", err1)
        err = err1
        return
    }
    defer resp.Body.Close()

    buff := make([]byte, 1024)
    for {
        // Read may return n > 0 together with io.EOF on the last chunk,
        // so consume the bytes before checking the error.
        n, err2 := resp.Body.Read(buff)
        if n > 0 {
            res += string(buff[:n])
        }
        if err2 == io.EOF {
            break
        }
        if err2 != nil {
            err = err2
            return
        }
    }

    return
}

The extracted titles and contents still contain markup, entities, and stray whitespace that need cleaning up.
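
Besides whitespace, the extracted text can still carry HTML entities such as &amp; and &lt;. One possible cleanup step (a sketch, not part of the original code) is html.UnescapeString from the standard library:

package main

import (
    "fmt"
    "html"
)

func main() {
    // UnescapeString decodes entities left behind by the regex extraction.
    fmt.Println(html.UnescapeString("AT&amp;T &lt;3")) // prints: AT&T <3
}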

The concurrent crawler, with whitespace cleanup and source URLs:

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)

var mPage = make(chan int) // each page reports its number here when done

func main() {
    fmt.Println("---段子爬虫编程演示案例---")
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
    // http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
    var start, end int

    fmt.Printf("输入起始页:")
    fmt.Scan(&start)
    fmt.Printf("输入结束页:")
    fmt.Scan(&end)
    // start = 1
    // end = 1

    doWork(start, end)
}

func doWork(start, end int) {
    fmt.Printf("Crawling pages %d - %d...\n", start, end)

    for i := start; i <= end; i++ {
        go getPage(i)
    }

    // receive exactly one completion message per page
    for i := start; i <= end; i++ {
        fmt.Printf("page [%d] done\n", <-mPage)
    }
}

func getPage(page int) {
    // always report back, even on failure, so doWork never blocks forever
    defer func() { mPage <- page }()

    url := "https://www.pengfue.com/index_" + strconv.Itoa(page) + ".html"
    fmt.Printf("fetching page [%d]: %s\n", page, url)
    res, err := httpGet(url)
    if err != nil {
        fmt.Println("err:", err)
        return
    }

    // each list item links to a detail page, e.g.:
    // <a href="https://www.pengfue.com/content_1857662_1.html" target="_blank">...</a>
    // MustCompile panics on a bad pattern, so no nil check is needed
    reg := regexp.MustCompile(`<h1 class="dp-b"><a href="(?s:(.*?))"`)
    urls := reg.FindAllStringSubmatch(res, -1)

    fileTitle := make([]string, 0)
    fileContent := make([]string, 0)
    fileUrl := make([]string, 0)

    for k, v := range urls {
        fmt.Println("url:", k, v[1])
        res, err := httpGet(v[1])
        if err != nil {
            fmt.Println("err:", err)
            return
        }

        // extract and clean the title
        regTitle := regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)
        titles := regTitle.FindAllStringSubmatch(res, 1)
        for _, title := range titles {
            tempTitle := title[1]
            tempTitle = strings.Replace(tempTitle, "\r", "", -1)
            tempTitle = strings.Replace(tempTitle, "\n", "", -1)
            tempTitle = strings.Replace(tempTitle, " ", "", -1)
            tempTitle = strings.Replace(tempTitle, "\t", "", -1)
            fmt.Println("title:", tempTitle)
            fileTitle = append(fileTitle, tempTitle)
        }

        // extract and clean the content; spaces are kept so the text stays readable
        regContent := regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev"`)
        contents := regContent.FindAllStringSubmatch(res, 1)
        for _, content := range contents {
            tempContent := content[1]
            tempContent = strings.Replace(tempContent, "\n", "", -1)
            tempContent = strings.Replace(tempContent, "\t", "", -1)
            fmt.Println("content:", tempContent)
            fileContent = append(fileContent, tempContent)
        }

        // remember the source URL for this joke
        fileUrl = append(fileUrl, v[1])
    }

    //fmt.Println("fileTitle = ", fileTitle)
    //fmt.Println("fileContent = ", fileContent)

    //saveToFile(page, fileTitle, fileContent)
    saveToFileWithUrl(page, fileTitle, fileContent, fileUrl)
    // fmt.Println("res:", res)

    mPage <- page

}

// earlier variant that omits the source URL, kept for reference
func saveToFile(page int, fileTitle, fileContent []string) {
    fileName := strconv.Itoa(page) + ".txt"
    f1, err1 := os.Create(fileName)
    if err1 != nil {
        fmt.Println("err1:", err1)
        return
    }

    defer f1.Close()

    n := len(fileTitle)
    for i := 0; i < n; i++ {
        f1.WriteString(fileTitle[i] + "\n")
        f1.WriteString(fileContent[i] + "\n")
        f1.WriteString("===================================================================\n")
    }

}
func saveToFileWithUrl(page int, fileTitle, fileContent, fileUrl []string) {
    // write title, content, and source URL to a file named <page>.txt
    fileName := strconv.Itoa(page) + ".txt"
    f1, err1 := os.Create(fileName)
    if err1 != nil {
        fmt.Println("err1:", err1)
        return
    }

    defer f1.Close()

    n := len(fileTitle)
    for i := 0; i < n; i++ {
        f1.WriteString(fileTitle[i] + "\n")
        f1.WriteString(fileContent[i] + "\n")
        f1.WriteString(fileUrl[i] + "\n")
        f1.WriteString("===================================================================\n")
    }

}

func httpGet(url string) (res string, err error) {
    resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
    if err1 != nil {
        fmt.Println("err1:", err1)
        err = err1
        return
    }
    defer resp.Body.Close()

    buff := make([]byte, 1024)
    for {
        // Read may return n > 0 together with io.EOF on the last chunk,
        // so consume the bytes before checking the error.
        n, err2 := resp.Body.Read(buff)
        if n > 0 {
            res += string(buff[:n])
        }
        if err2 == io.EOF {
            break
        }
        if err2 != nil {
            err = err2
            return
        }
    }

    return
}

END.
