golang爬虫框架colly

colly.png

colly是一款快速优雅的golang爬虫框架，简单易用，功能完备。

官网地址：http://go-colly.org/

包地址：import "github.com/gocolly/colly"

一个简单的例子：

package main

import (
    "fmt"
    "github.com/gocolly/colly"
)

// main crawls http://go-colly.org/, follows every link found on each fetched
// page, and prints each URL before it is requested.
func main() {
    c := colly.NewCollector()

    // Follow the href of every anchor element on the page.
    c.OnHTML("a", func(e *colly.HTMLElement) {
        // Visit may reject a link (for example one the collector has already
        // seen); that is expected while crawling, so the error is dropped on
        // purpose rather than reported for every link.
        _ = e.Request.Visit(e.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    // Report a failure of the start request instead of exiting silently.
    if err := c.Visit("http://go-colly.org/"); err != nil {
        fmt.Println(err.Error())
    }
}

使用方式概括下来主要有三步：

创建一个采集器
注册回调函数
访问具体网站

创建采集器时可以指定一些配置参数，如useragent，爬取深度及日志等

// Create a collector with three options: a custom User-Agent string,
// a maximum depth of 1, and a LogDebugger attached as the debugger.
colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
 colly.MaxDepth(1), 
colly.Debugger(&debug.LogDebugger{}))

回调函数共有7种

名称	说明	参数1	参数2
OnRequest	请求前调用	*colly.Request
OnError	请求发生错误时调用	*colly.Response	error
OnResponseHeaders	收到响应头后调用	*colly.Response
OnResponse	收到响应后调用	*colly.Response
OnHTML	响应内容是HTML时调用	goquery选择器（CSS选择器）	func(e *colly.HTMLElement)
OnXML	响应内容是XML时调用	xpath表达式	func(e *colly.XMLElement)
OnScraped	在OnXML之后调用	func(r *colly.Response)

OnHTML回调可以注册多个，分别匹配不同的goquery选择器（CSS选择器）

1. 爬取首页文章列表

通过浏览器开发者工具查看jianshu.com结构如下

colly-jianshu-dom.png

文章列表为ul标签，中间每一项是li标签，li中包含content，content中包含title，abstract和meta标签

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
)

// main fetches https://www.jianshu.com and prints the title, link and
// abstract of every entry in the page's article list.
func main() {
    const (
        listSelector     = "ul[class='note-list']"
        titleSelector    = "div[class='content'] > a[class='title']"
        abstractSelector = "div[class='content'] > p[class='abstract']"
    )

    collector := colly.NewCollector(
        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
        colly.MaxDepth(1),
        colly.Debugger(&debug.LogDebugger{}),
    )

    // printArticle handles one <li> of the list: it prints the title and
    // link on one line, the abstract on the next, then a blank separator.
    printArticle := func(_ int, entry *colly.HTMLElement) {
        link := entry.ChildAttr(titleSelector, "href")
        heading := entry.ChildText(titleSelector)
        abstract := entry.ChildText(abstractSelector)

        fmt.Println(heading, link)
        fmt.Println(abstract)
        fmt.Println()
    }

    // The article list is a <ul>; every <li> inside it is one article.
    collector.OnHTML(listSelector, func(list *colly.HTMLElement) {
        list.ForEach("li", printArticle)
    })

    if err := collector.Visit("https://www.jianshu.com"); err != nil {
        fmt.Println(err.Error())
    }
}

2. 爬取文章列表和详情

文章列表的爬取方式和第1节一样，文章详情则通过克隆出的新采集器访问详情页面来获取

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "time"
)

// main crawls the jianshu.com home page with collector c1, and for every
// article in the list hands the link, title and abstract to a second,
// asynchronous and rate-limited collector c2 that fetches the article page
// and prints its text.
func main() {
    c1 := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1))
    c2 := c1.Clone()

    // Run the detail collector asynchronously.
    c2.Async = true
    // Rate limit the detail collector. Limit rules are matched against the
    // request host only, so the glob must not contain a path component:
    // a pattern such as "*.jianshu.com/p/*" never matches and the limit
    // would silently not apply.
    if err := c2.Limit(&colly.LimitRule{
        DomainGlob:  "*.jianshu.com",
        Delay:       10 * time.Second,
        Parallelism: 1,
    }); err != nil {
        fmt.Println(err.Error())
        return
    }

    // Collector 1: read the article list.
    c1.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
        e.ForEach("li", func(i int, item *colly.HTMLElement) {
            href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
            if href == "" {
                // List entry without a title link: there is no detail page to fetch.
                return
            }
            title := item.ChildText("div[class='content'] > a[class='title']")
            summary := item.ChildText("div[class='content'] > p[class='abstract']")

            // The Context carries the data collected by c1 over to c2.
            ctx := colly.NewContext()
            ctx.Put("href", href)
            ctx.Put("title", title)
            ctx.Put("summary", summary)

            // Resolve the link against the current page so both relative
            // and absolute hrefs produce a valid URL.
            if err := c2.Request("GET", e.Request.AbsoluteURL(href), nil, ctx, nil); err != nil {
                fmt.Println(err.Error())
            }
        })
    })

    // Collector 2: read the article detail.
    c2.OnHTML("article", func(e *colly.HTMLElement) {
        href := e.Request.Ctx.Get("href")
        title := e.Request.Ctx.Get("title")
        summary := e.Request.Ctx.Get("summary")
        detail := e.Text

        fmt.Println("----------" + title + "----------")
        fmt.Println(href)
        fmt.Println(summary)
        fmt.Println(detail)
        fmt.Println()
    })

    c2.OnRequest(func(r *colly.Request) {
        fmt.Println("c2爬取页面：", r.URL)
    })

    // c2 is asynchronous, so fetch failures surface here rather than as a
    // return value of c2.Request.
    c2.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    c1.OnRequest(func(r *colly.Request) {
        fmt.Println("c1爬取页面：", r.URL)
    })

    c1.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    err := c1.Visit("https://www.jianshu.com")
    if err != nil {
        fmt.Println(err.Error())
    }

    // Block until every request queued on the async collector has finished.
    c2.Wait()
}

3. 爬取需要登录的网页

官网提供登录页处理的例子，但是大多数涉及验证码，不好处理，目前方式是手动登录，复制cookie写到爬虫请求头里

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/extensions"
    _ "github.com/gocolly/colly/extensions"
    "net/http"
)

// main fetches the jianshu.com home page as a logged-in user by attaching a
// manually copied login cookie to the collector, then prints the title, link
// and abstract of every article in the list.
func main() {
    siteURL := "https://www.jianshu.com"

    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1), colly.Debugger(&debug.LogDebugger{}))
    c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
        e.ForEach("li", func(i int, item *colly.HTMLElement) {
            href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
            title := item.ChildText("div[class='content'] > a[class='title']")
            summary := item.ChildText("div[class='content'] > p[class='abstract']")
            fmt.Println(title, href)
            fmt.Println(summary)
            fmt.Println()
        })
    })

    // Use a random User-Agent for each request.
    // NOTE(review): this presumably overrides the fixed colly.UserAgent
    // passed to NewCollector above - confirm against the extensions package.
    extensions.RandomUserAgent(c)

    // Attach the login cookie. Without it the crawl would run anonymously,
    // so a failure here is reported and the program stops.
    err := c.SetCookies(siteURL, []*http.Cookie{
        {
            Name:     "remember_user_token",
            Value:    "wNDUxOV0sIiQyYSQxMSRwdkhqWVhHYmxXaDJ6dEU3NzJwbmsuIiwiMTU",
            Path:     "/",
            Domain:   ".jianshu.com",
            Secure:   true,
            HttpOnly: true,
        },
    })
    if err != nil {
        fmt.Println(err.Error())
        return
    }

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    if err := c.Visit(siteURL); err != nil {
        fmt.Println(err.Error())
    }
}

4. 内存任务队列

将需要爬取的链接放入队列中，设置队列并发数，可以并行爬取链接

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/queue"
)

// main crawls jianshu.com through an in-memory request queue consumed by
// five worker goroutines, following every link up to the collector's
// maximum depth of 3.
func main() {
    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

    // Create an in-memory queue holding at most 10000 requests, consumed by 5 goroutines.
    q, err := queue.New(5, &queue.InMemoryQueueStorage{MaxSize: 10000})
    if err != nil {
        fmt.Println(err.Error())
        return
    }

    c.OnHTML("a", func(element *colly.HTMLElement) {
        // Rejected links (for example ones already seen) are expected while
        // crawling, so the error is dropped on purpose.
        _ = element.Request.Visit(element.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    // Seed the queue; without the start URL there is nothing to crawl.
    if err := q.AddURL("https://www.jianshu.com"); err != nil {
        fmt.Println(err.Error())
        return
    }

    // Consume the queue with the collector until it is empty.
    if err := q.Run(c); err != nil {
        fmt.Println(err.Error())
    }
}

5. redis任务队列

设置redis存储后，队列中URL存储在redis中，访问页面的cookie及访问记录也会保存在redis中

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "github.com/gocolly/colly/queue"
    "github.com/gocolly/redisstorage"
)

// main crawls jianshu.com with a Redis-backed storage that is used both as
// the collector's storage and as the request queue, following every link up
// to the collector's maximum depth of 3.
func main() {
    c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

    storage := &redisstorage.Storage{
        Address:  "192.168.1.10:6379",
        Password: "123456",
        DB:       0,
        Prefix:   "colly",
        Client:   nil,
        Expires:  0,
    }

    // SetStorage initialises the storage (Client is nil until then), so a
    // failure here means Redis is unusable and nothing below can work.
    if err := c.SetStorage(storage); err != nil {
        panic(err)
    }

    // Remove data left over from a previous run.
    if err := storage.Clear(); err != nil {
        panic(err)
    }

    // Close the Redis client when main returns.
    defer storage.Client.Close()

    // Use the same Redis storage as the queue backend, consumed by 5 goroutines.
    q, err := queue.New(5, storage)
    if err != nil {
        panic(err)
    }

    c.OnHTML("a", func(element *colly.HTMLElement) {
        // Rejected links (for example ones already seen) are expected while
        // crawling, so the error is dropped on purpose.
        _ = element.Request.Visit(element.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("爬取页面：", r.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    // Seed the queue; without the start URL there is nothing to crawl.
    if err := q.AddURL("https://www.jianshu.com"); err != nil {
        fmt.Println(err.Error())
        return
    }

    // Consume the queue with the collector until it is empty.
    if err := q.Run(c); err != nil {
        fmt.Println(err.Error())
    }
}

redis中数据

6. 配置代理

package main

import (
    "bytes"
    "log"

    "github.com/gocolly/colly"
    "github.com/gocolly/colly/proxy"
)

// main requests https://httpbin.org/ip through a round-robin pair of proxies
// and logs the response body with its newlines removed.
func main() {
    c := colly.NewCollector()

    // Configure two proxies (one HTTP, one SOCKS5) behind a round-robin switcher.
    rp, err := proxy.RoundRobinProxySwitcher("http://127.0.0.1:1080", "socks5://127.0.0.1:1338")
    if err != nil {
        log.Fatal(err)
    }
    c.SetProxyFunc(rp)

    c.OnResponse(func(r *colly.Response) {
        // Strip newlines so the body is logged on a single line.
        log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
    })

    // Report a failed request (for example an unreachable proxy) instead of
    // exiting silently with no output.
    if err := c.Visit("https://httpbin.org/ip"); err != nil {
        log.Fatal(err)
    }
}