Go小项目-扫描子域名

2018-03-15  本文已影响44人  c4a1d989518e

项目结果

代码的功能
运行结果

项目代码

代码为

package main

import (
    "encoding/xml"
    "flag"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
    "strings"

    "github.com/gophercises/link"
)

/*
   1. GET the webpage
   2. parse all the links on the page
   3. build proper urls with our links
   4. filter out any links with a different domain
   5. Find all pages (BFS)
   6. print out XML
*/

// xmlns is the XML namespace URI that main stores in the urlset's Xmlns
// field, so it is emitted as the xmlns attribute of the encoded document.
const xmlns = "http://www.sitemaps.org/schemas/sitemap/0.9"

// loc is a single sitemap entry. When marshaled with encoding/xml, Value is
// written as the text of a nested <loc> element.
type loc struct {
    Value string `xml:"loc"`
}

// urlset is the value that main encodes to produce the sitemap document.
type urlset struct {
    // Urls holds the entries; each one is marshaled as a <url> element.
    Urls  []loc  `xml:"url"`
    // Xmlns is marshaled as the xmlns attribute of the enclosing element.
    Xmlns string `xml:"xmlns,attr"`
}

// main parses the -url and -depth flags, crawls the site breadth-first
// starting at the given URL, and writes every discovered page to stdout as an
// indented XML sitemap preceded by the standard XML header.
//
// If the sitemap cannot be encoded (for example because stdout was closed),
// the error is reported on stderr and the process exits with status 1.
func main() {
    urlFlag := flag.String("url", "https://gophercises.com", "the url that you want to build a sitemap for")
    maxDepth := flag.Int("depth", 10, "the maximum number of links deep to traverse")
    flag.Parse()

    pages := bfs(*urlFlag, *maxDepth)
    sitemap := urlset{
        // The number of entries is known up front, so allocate once.
        Urls:  make([]loc, 0, len(pages)),
        Xmlns: xmlns,
    }
    for _, page := range pages {
        sitemap.Urls = append(sitemap.Urls, loc{page})
    }

    fmt.Print(xml.Header)
    enc := xml.NewEncoder(os.Stdout)
    enc.Indent("", "  ")
    if err := enc.Encode(sitemap); err != nil {
        // An encode failure is a runtime I/O problem, not a programming
        // error, so report it plainly instead of panicking with a stack trace.
        fmt.Fprintf(os.Stderr, "encoding sitemap: %v\n", err)
        os.Exit(1)
    }
    fmt.Println()
}

// bfs crawls breadth-first starting at urlStr and returns every page URL it
// reaches within maxDepth link hops; the start page itself is depth 0.
//
// Each page is recorded at most once. The returned slice is in map-iteration
// order, i.e. the order is unspecified. A negative maxDepth yields an empty
// result because the loop body never runs.
func bfs(urlStr string, maxDepth int) []string {
    seen := make(map[string]struct{})
    var q map[string]struct{}
    nq := map[string]struct{}{
        urlStr: {},
    }
    for depth := 0; depth <= maxDepth; depth++ {
        // Promote the links collected on the previous level to the current
        // queue and start collecting the next level from scratch.
        q, nq = nq, make(map[string]struct{})
        if len(q) == 0 {
            break
        }
        for page := range q {
            if _, ok := seen[page]; ok {
                continue
            }
            seen[page] = struct{}{}
            if depth == maxDepth {
                // Links found on the deepest level would never be visited,
                // so fetching the page would be a wasted HTTP request.
                continue
            }
            for _, next := range get(page) {
                // Already-recorded pages would be skipped on the next level
                // anyway; not queuing them lets the crawl stop earlier.
                if _, ok := seen[next]; !ok {
                    nq[next] = struct{}{}
                }
            }
        }
    }
    ret := make([]string, 0, len(seen))
    for page := range seen {
        ret = append(ret, page)
    }
    return ret
}

// get fetches urlStr and returns the links found on that page which point to
// the same site, where "same site" means the same scheme and host as the URL
// the response was finally served from (i.e. after any redirects).
//
// It is best effort: a failed request or a non-2xx response yields no links
// rather than an error.
//
// NOTE(review): http.Get uses http.DefaultClient, which has no timeout, so a
// stalled server can block the crawl indefinitely.
func get(urlStr string) []string {
    resp, err := http.Get(urlStr)
    if err != nil {
        return nil
    }
    defer resp.Body.Close()
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        // Error pages are not part of the site's content; don't mine them
        // for links.
        return nil
    }
    reqURL := resp.Request.URL
    baseURL := &url.URL{
        Scheme: reqURL.Scheme,
        Host:   reqURL.Host,
    }
    base := baseURL.String()
    // Compare the parsed scheme and host instead of testing for a raw string
    // prefix: a prefix test on "https://example.com" would also accept
    // "https://example.com.evil.org/" and "https://example.com@evil.org/".
    sameSite := func(link string) bool {
        u, err := url.Parse(link)
        return err == nil && u.Scheme == reqURL.Scheme && u.Host == reqURL.Host
    }
    return filter(hrefs(resp.Body, base), sameSite)
}

// hrefs parses the HTML read from r and returns the link targets it can turn
// into absolute URLs, using base (expected in "scheme://host" form) for
// anything that is not already absolute:
//
//   - "//host/path" (scheme-relative) gets the scheme of base;
//   - "/path" (root-relative) is appended to base;
//   - "http://..." and "https://..." are returned unchanged.
//
// All other hrefs (fragments, mailto:, document-relative paths, ...) are
// dropped. If the document cannot be parsed, no links are returned.
func hrefs(r io.Reader, base string) []string {
    links, err := link.Parse(r)
    if err != nil {
        // Best effort: an unparseable page simply contributes no links.
        return nil
    }
    var ret []string
    for _, l := range links {
        switch {
        case strings.HasPrefix(l.Href, "//"):
            // Must be checked before the "/" case: a scheme-relative link
            // names its own host, so gluing it onto base would fabricate a
            // same-site URL such as "https://site//other.host/path".
            if i := strings.Index(base, "://"); i >= 0 {
                ret = append(ret, base[:i+1]+l.Href)
            }
        case strings.HasPrefix(l.Href, "/"):
            ret = append(ret, base+l.Href)
        case strings.HasPrefix(l.Href, "http://"), strings.HasPrefix(l.Href, "https://"):
            ret = append(ret, l.Href)
        }
    }
    return ret
}

// filter returns, in their original order, the elements of links for which
// keepFn reports true. The result is nil when nothing is kept.
func filter(links []string, keepFn func(string) bool) []string {
    var kept []string
    for i := 0; i < len(links); i++ {
        candidate := links[i]
        if !keepFn(candidate) {
            continue
        }
        kept = append(kept, candidate)
    }
    return kept
}

// withPrefix builds a predicate that reports whether its argument begins with
// pfx. The returned function is suitable as the keepFn argument of filter.
func withPrefix(pfx string) func(string) bool {
    startsWithPfx := func(candidate string) bool {
        return strings.HasPrefix(candidate, pfx)
    }
    return startsWithPfx
}

参考链接

https://github.com/gophercises/sitemap/tree/solution

上一篇下一篇

猜你喜欢

热点阅读