很久没写什么代码了,除了应付学校的期末大作业,一直都浑浑噩噩的。今天看到了Go语言的爬虫,觉得挺有趣,有点手痒。不过好久没写Go了,还是花了一点时间来回忆。
写了两段代码,打算比较一下python和go的爬虫速度
目标网站: 性感美女图
其实原准备爬取豆瓣250,可是实力确实太菜,一开始用go爬取忘记添加请求头,直接被封ip了,没办法选择了这个反爬措施比较少的网站。然后主要比较速度,所以只爬取了标题,没有获取图片(存这些图片怕身体吃不消)
Python版本:
import requests
from lxml import etree
import time
def get_title():
    """Fetch listing pages 1-10 of the target site and print every entry title.

    Titles are printed as ``<index>: <title>`` using a running index, so the
    numbering stays contiguous even when pages hold different numbers of
    entries. Returns nothing; the only output is what is printed.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including when the request times out).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Host": "www.meituba.com"
    }
    count = 0
    for page in range(1, 11):
        url = f"http://www.meituba.com/xinggan/list8{page}.html"
        # Without a timeout a single stalled connection would hang the run.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        bs = etree.HTML(response.text)
        for item in bs.xpath('//div[@class="channel_list"]/ul/li'):
            texts = item.xpath('./a/text()')
            if not texts:
                # An <li> whose link has no text: skip it instead of
                # raising IndexError on texts[0].
                continue
            count += 1
            print(f"{count}:", texts[0])
if __name__ == '__main__':
    # proxies = {'https': '183.185.185.64:9797'}
    # Time one complete crawl and report the elapsed wall-clock seconds.
    started = time.time()
    get_title()
    elapsed = time.time() - started
    print("花费的时间为:", elapsed)
package main
import (
"fmt"
"log"
"net/http"
"strconv"
"time"
"github.com/goquery"
)
// main crawls listing pages 1-10 one after another, prints the title of
// every ".imgc" entry with a running index, and reports the elapsed time.
func main() {
	// One shared client; the timeout keeps a stalled server from hanging the run.
	client := &http.Client{Timeout: 10 * time.Second}
	begin := time.Now()
	count := 0
	for page := 1; page <= 10; page++ {
		url := "http://www.meituba.com/xinggan/list8" + strconv.Itoa(page) + ".html"
		request, err := http.NewRequest("GET", url, nil)
		if err != nil {
			log.Fatal(err)
		}
		// No "Host" header is added: for client requests net/http takes the
		// host from the URL and ignores a "Host" entry in Header.
		request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36")
		response, err := client.Do(request)
		if err != nil {
			log.Fatal(err)
		}
		if response.StatusCode != http.StatusOK {
			response.Body.Close()
			log.Printf("skipping %s: unexpected status %s", url, response.Status)
			continue
		}
		opending, err := goquery.NewDocumentFromReader(response.Body)
		// Close immediately instead of deferring: a defer inside the loop
		// would keep every body open until main returns.
		response.Body.Close()
		if err != nil {
			log.Fatal(err)
		}
		opending.Find(".imgc").Each(func(_ int, content *goquery.Selection) {
			name, ok := content.Find("a").First().Attr("title")
			if !ok {
				// Entry without a title attribute: nothing to print.
				return
			}
			// A running counter keeps numbering contiguous without
			// assuming a fixed number of entries per page.
			count++
			fmt.Printf("%d: %s\n", count, name)
		})
	}
	fmt.Println("花费时间为:", time.Since(begin))
}
然后进阶一下,使用channel和goroutine试试能不能继续提速
将请求网页返回的页面放进channel,然后一边获得页面,一边解析页面
package main
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/goquery"
)
// httpClient is shared by every getBody call so connections can be reused;
// the timeout keeps a stalled server from blocking a goroutine forever.
var httpClient = &http.Client{Timeout: 10 * time.Second}

// getBody downloads url and sends the response body on chanWeb as a string.
//
// If the request cannot be built or performed, the status is not 200 OK, or
// the body cannot be read, the problem is printed and getBody returns
// without sending anything, so callers must not assume exactly one value
// per call.
func getBody(url string, chanWeb chan string) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println(err)
		return
	}
	// No "Host" header is added: for client requests net/http takes the
	// host from the URL and ignores a "Host" entry in Header.
	request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36")
	response, err := httpClient.Do(request)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		fmt.Printf("%s: unexpected status %s\n", url, response.Status)
		return
	}
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	chanWeb <- string(body)
}
// getTitle parses body as HTML and, for every element with class "imgc",
// sends the title attribute of its first <a> descendant on chanR.
//
// A parse error is printed and nothing is sent. Entries whose link has no
// title attribute are skipped.
func getTitle(body string, chanR chan string) {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		fmt.Println(err)
		// The document is not usable after a parse error.
		return
	}
	dom.Find(".imgc").Each(func(i int, selection *goquery.Selection) {
		title, ok := selection.Find("a").First().Attr("title")
		if !ok {
			return
		}
		chanR <- title
	})
}
// main fetches listing pages 1-10 concurrently, parses each page as soon as
// it arrives, prints every title with a running index, and reports the
// elapsed time.
//
// Pipeline: one getBody goroutine per page feeds chanWeb; one getTitle
// goroutine per received page feeds chanR; main drains chanR. Each channel
// is closed by the sending side once all of its senders have finished, which
// is what lets the range loops terminate.
func main() {
	begin := time.Now()
	// Page numbers 1..10, matching the URLs used by the sequential versions.
	baseURL := "http://www.meituba.com/xinggan/list8%d.html"
	chanWeb := make(chan string)
	chanR := make(chan string)

	var fetchers sync.WaitGroup
	for i := 1; i <= 10; i++ {
		fetchers.Add(1)
		go func(url string) {
			defer fetchers.Done()
			getBody(url, chanWeb)
		}(fmt.Sprintf(baseURL, i))
	}
	go func() {
		fetchers.Wait()
		close(chanWeb)
	}()

	go func() {
		var parsers sync.WaitGroup
		// Range over the channel itself: "range <-chanWeb" would receive a
		// single page and iterate over the bytes of that one string.
		for webContent := range chanWeb {
			parsers.Add(1)
			go func(body string) {
				defer parsers.Done()
				getTitle(body, chanR)
			}(webContent)
		}
		parsers.Wait()
		close(chanR)
	}()

	count := 0
	for res := range chanR {
		count++
		fmt.Printf("%d:%s\n", count, res)
	}
	fmt.Println("花费的时间为:", time.Since(begin))
}
所以go以速度快(高并发)出名还是有道理的,在数据量越来越大的时代,程序的处理速度也得跟上,习惯了使用简单的python可远远不够,还是应该继续学习新知识。其实比起C语言来说,go还是简单很多的,虽然没有python那么容易上手,但是熟悉之后会喜欢上的。