colly是一款快速优雅的golang爬虫框架,简单易用,功能完备。
官网地址:http://go-colly.org/
包地址:import "github.com/gocolly/colly"
一个简单的例子:
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main crawls http://go-colly.org/: it prints every URL that is requested
// and follows the href of each anchor element found on the visited pages.
func main() {
	c := colly.NewCollector()

	// For every <a> element, visit the URL in its href attribute.
	c.OnHTML("a", func(e *colly.HTMLElement) {
		// The error return is intentionally not checked: following
		// individual links is best-effort in this example.
		e.Request.Visit(e.Attr("href"))
	})

	// Log each request before it is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Report a failed start request instead of silently dropping the error.
	if err := c.Visit("http://go-colly.org/"); err != nil {
		fmt.Println(err.Error())
	}
}
使用方式概括下来主要有三步:
- 创建一个采集器
- 注册回调函数
- 访问具体网站
创建采集器时可以指定一些配置参数,如useragent,爬取深度及日志等
// Create a collector with a custom User-Agent, a maximum crawl depth of 1
// and a LogDebugger attached.
colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
colly.MaxDepth(1),
colly.Debugger(&debug.LogDebugger{}))
回调函数共有7种
名称 | 说明 | 参数1 | 参数2 |
---|---|---|---|
OnRequest | 请求前调用 | *colly.Request | |
OnError | 请求发生错误时调用 | *colly.Response | error |
OnResponseHeaders | 收到响应头后调用 | *colly.Response | |
OnResponse | 收到响应后调用 | *colly.Response | |
OnHTML | 响应内容是HTML时调用 | CSS选择器(goquery) | func(e *colly.HTMLElement) |
OnXML | 响应内容是XML时调用 | xpath表达式 | func(e *colly.XMLElement) |
OnScraped | 在OnXML之后调用 | func(r *colly.Response) | |
OnHTML回调可以注册多个,匹配不同的CSS选择器
1. 爬取首页文章列表
通过浏览器开发者工具查看jianshu.com结构如下
文章列表为ul标签,中间每一项是li标签,li中包含content,content中包含title,abstract和meta标签
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
)
// main prints the title, link and abstract of every article in the note
// list on the jianshu.com front page.
func main() {
	collector := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Selectors evaluated relative to a single <li> of the article list.
	const (
		titleSelector    = "div[class='content'] > a[class='title']"
		abstractSelector = "div[class='content'] > p[class='abstract']"
	)

	// printEntry writes one list item: title and link on the first line,
	// the abstract on the second, followed by an empty line.
	printEntry := func(_ int, entry *colly.HTMLElement) {
		link := entry.ChildAttr(titleSelector, "href")
		heading := entry.ChildText(titleSelector)
		abstract := entry.ChildText(abstractSelector)

		fmt.Println(heading, link)
		fmt.Println(abstract)
		fmt.Println()
	}

	// The article list is the <ul class="note-list"> element; handle each of its items.
	collector.OnHTML("ul[class='note-list']", func(list *colly.HTMLElement) {
		list.ForEach("li", printEntry)
	})

	if err := collector.Visit("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
	}
}
2.爬取文章列表和详情
文章列表和1方式一样,文章详情通过创建新的采集器访问详情页面
package main
import (
"fmt"
"github.com/gocolly/colly"
"time"
)
// main crawls the jianshu.com front page with one collector (c1) and fetches
// every listed article with a second, asynchronous and rate-limited collector
// (c2). For each article it prints the title, link, abstract and body text.
func main() {
	c1 := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1))
	c2 := c1.Clone()
	// Issue the detail requests asynchronously.
	c2.Async = true
	// Rate limit the detail requests. Limit rules are matched against the
	// request's host name rather than the full URL, so the glob must not
	// contain a path component; a pattern such as "*.jianshu.com/p/*" would
	// never match and the limit would silently not apply.
	if err := c2.Limit(&colly.LimitRule{
		DomainGlob:  "*.jianshu.com",
		Delay:       10 * time.Second,
		Parallelism: 1,
	}); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Collector 1: extract the article list.
	c1.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
		e.ForEach("li", func(i int, item *colly.HTMLElement) {
			href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
			if href == "" {
				// No article link in this item; requesting the bare site
				// root on the detail collector would be pointless.
				return
			}
			title := item.ChildText("div[class='content'] > a[class='title']")
			summary := item.ChildText("div[class='content'] > p[class='abstract']")

			// Hand the data collected by c1 over to c2 through a Context.
			ctx := colly.NewContext()
			ctx.Put("href", href)
			ctx.Put("title", title)
			ctx.Put("summary", summary)
			if err := c2.Request("GET", "https://www.jianshu.com"+href, nil, ctx, nil); err != nil {
				fmt.Println(err.Error())
			}
		})
	})

	// Collector 2: extract the article body and print it together with the
	// values stored in the request context by collector 1.
	c2.OnHTML("article", func(e *colly.HTMLElement) {
		href := e.Request.Ctx.Get("href")
		title := e.Request.Ctx.Get("title")
		summary := e.Request.Ctx.Get("summary")
		detail := e.Text
		fmt.Println("----------" + title + "----------")
		fmt.Println(href)
		fmt.Println(summary)
		fmt.Println(detail)
		fmt.Println()
	})

	c2.OnRequest(func(r *colly.Request) {
		fmt.Println("c2爬取页面:", r.URL)
	})
	c1.OnRequest(func(r *colly.Request) {
		fmt.Println("c1爬取页面:", r.URL)
	})
	c1.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c1.Visit("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
	}
	// Block until all asynchronous requests of c2 have finished.
	c2.Wait()
}
3. 爬取需要登录的网页
官网提供登录页处理的例子,但是大多数涉及验证码,不好处理,目前方式是手动登录,复制cookie写到爬虫请求头里
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/extensions"
_ "github.com/gocolly/colly/extensions"
"net/http"
)
// main crawls the jianshu.com front page while sending a manually copied
// login cookie, and prints the title, link and abstract of each listed article.
func main() {
	url := "https://www.jianshu.com"
	c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1), colly.Debugger(&debug.LogDebugger{}))

	// Print every item of the <ul class="note-list"> article list.
	c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
		e.ForEach("li", func(i int, item *colly.HTMLElement) {
			href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
			title := item.ChildText("div[class='content'] > a[class='title']")
			summary := item.ChildText("div[class='content'] > p[class='abstract']")
			fmt.Println(title, href)
			fmt.Println(summary)
			fmt.Println()
		})
	})

	// Use a random User-Agent.
	extensions.RandomUserAgent(c)

	// Install the login cookie. Without it the crawl would run
	// unauthenticated, so stop if the cookie cannot be set.
	err := c.SetCookies(url, []*http.Cookie{
		{
			Name:     "remember_user_token",
			Value:    "wNDUxOV0sIiQyYSQxMSRwdkhqWVhHYmxXaDJ6dEU3NzJwbmsuIiwiMTU",
			Path:     "/",
			Domain:   ".jianshu.com",
			Secure:   true,
			HttpOnly: true,
		},
	})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取页面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c.Visit(url); err != nil {
		fmt.Println(err.Error())
	}
}
4. 内存任务队列
将需要爬取的链接放入队列中,设置队列并发数,可以并行爬取链接
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/queue"
)
// main crawls jianshu.com through an in-memory request queue, following
// every link it finds and printing each requested URL.
func main() {
	c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

	// Create an in-memory queue with a capacity of 10000 and 5 consumer goroutines.
	q, err := queue.New(5, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	// Follow the href of every <a> element.
	c.OnHTML("a", func(element *colly.HTMLElement) {
		// Best-effort: an error from following a single link is not checked.
		element.Request.Visit(element.Attr("href"))
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取页面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Seed the queue; without the start URL there is nothing to crawl.
	if err := q.AddURL("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
		return
	}
	// Consume the queue with the collector.
	if err := q.Run(c); err != nil {
		fmt.Println(err.Error())
	}
}
5. redis任务队列
设置redis存储后,队列中URL存储在redis中,访问页面的cookie及访问记录也会保存在redis中
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/queue"
"github.com/gocolly/redisstorage"
)
// main crawls jianshu.com using a redis-backed storage both as the
// collector's storage and as the request queue, following every link it
// finds and printing each requested URL.
func main() {
	c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(3), colly.Debugger(&debug.LogDebugger{}))

	storage := &redisstorage.Storage{
		Address:  "192.168.1.10:6379",
		Password: "123456",
		DB:       0,
		Prefix:   "colly",
		Client:   nil,
		Expires:  0,
	}

	// Attach the storage to the collector. A failure here means the redis
	// backend is unusable, so abort instead of continuing with it.
	if err := c.SetStorage(storage); err != nil {
		panic(err)
	}
	// Registered before Clear so the client is closed even if Clear panics.
	// NOTE(review): relies on SetStorage having populated storage.Client - confirm.
	defer storage.Client.Close()

	// Remove data left over from a previous run.
	if err := storage.Clear(); err != nil {
		panic(err)
	}

	// Queue backed by the same storage, consumed by 5 goroutines.
	q, err := queue.New(5, storage)
	if err != nil {
		panic(err)
	}

	// Follow the href of every <a> element.
	c.OnHTML("a", func(element *colly.HTMLElement) {
		// Best-effort: an error from following a single link is not checked.
		element.Request.Visit(element.Attr("href"))
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取页面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Seed the queue; without the start URL there is nothing to crawl.
	if err := q.AddURL("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
		return
	}
	// Consume the queue with the collector.
	if err := q.Run(c); err != nil {
		fmt.Println(err.Error())
	}
}
6.配置代理
package main
import (
"bytes"
"log"
"github.com/gocolly/colly"
"github.com/gocolly/colly/proxy"
)
// main requests https://httpbin.org/ip through a round-robin pair of
// proxies and logs the response body with its newlines removed.
func main() {
	c := colly.NewCollector()

	// Configure two proxies that are rotated between requests.
	rp, err := proxy.RoundRobinProxySwitcher("http://127.0.0.1:1080", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	// Log the body on a single line.
	c.OnResponse(func(r *colly.Response) {
		log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
	})

	// Surface a failed request (e.g. an unreachable proxy) instead of
	// exiting silently with no output.
	if err := c.Visit("https://httpbin.org/ip"); err != nil {
		log.Fatal(err)
	}
}