### Golang爬虫库
goquery
### goquery安装
在cmd中执行下面的go命令:

```
go get github.com/PuerkitoBio/goquery
```
### 内涵段子
// GetJokes3 loads http://neihanshequ.com/ and, for every element matching
// ".site-width .options", prints four values: the data-text, data-url and
// data-pic attributes of its ".share-wrapper" descendant, followed by the
// text of ".digg-wrapper .digg".
//
// If the page cannot be loaded, the error is printed and the function
// returns without printing any entries.
func GetJokes3() {
	q, err := goquery.NewDocument("http://neihanshequ.com/")
	if err != nil {
		// q is nil on failure; continuing would dereference it and panic.
		fmt.Println("document error", err)
		return
	}
	q.Find(".site-width .options").Each(func(i int, selection *goquery.Selection) {
		// Look the share wrapper up once; all three attributes live on it.
		share := selection.Find(".share-wrapper")
		// A missing attribute yields "", which is printed as an empty value.
		text, _ := share.Attr("data-text")
		u, _ := share.Attr("data-url")
		p, _ := share.Attr("data-pic")
		d := selection.Find(".digg-wrapper .digg").Text()
		fmt.Println("开始了,结果是", text)
		fmt.Println("开始了,结果是", u)
		fmt.Println("开始了,结果是", p)
		fmt.Println("开始了,结果是", d)
	})
}
### 中文幽默王
// GetJokes5 crawls the list pages index_0.htm through index_9.htm under
// http://www.haha365.com/joke/ and prints the text of every "#endtext"
// element found on each page.
//
// A page that cannot be fetched, read or parsed is reported on standard
// output and skipped; the remaining pages are still processed.
func GetJokes5() {
	for i := 0; i < 10; i++ {
		u := fmt.Sprintf("http://www.haha365.com/joke/index_%d.htm", i)
		fmt.Println("地址是:", u)
		if err := printJokePage(u); err != nil {
			fmt.Println("page error:", u, err)
		}
	}
}

// printJokePage downloads the page at u, converts it from GBK to UTF-8,
// and prints the text of each "#endtext" element. It returns the first
// error hit while fetching, reading or parsing the page.
//
// Keeping the per-page work in its own function makes the deferred
// Body.Close run at the end of each page instead of piling up until the
// whole crawl finishes.
func printJokePage(u string) error {
	resp, err := http.Get(u)
	if err != nil {
		return fmt.Errorf("fetching page: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %s", resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("reading body: %w", err)
	}

	// The original author notes the site serves GBK while goquery expects
	// UTF-8, so the payload is decoded from GBK before parsing.
	str := mahonia.NewDecoder("gbk").ConvertString(string(body))

	// Resolve HTML entities (e.g. escaped quotation marks) up front.
	// NOTE(review): unescaping before parsing also turns escaped "&lt;" /
	// "&gt;" into real markup — confirm that is acceptable for these pages.
	str = html.UnescapeString(str)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(str))
	if err != nil {
		return fmt.Errorf("parsing document: %w", err)
	}

	doc.Find("#endtext").Each(func(_ int, selection *goquery.Selection) {
		fmt.Println(selection.Text())
	})
	return nil
}
### 糗事百科
// GetJokes2 loads https://www.qiushibaike.com/hot/ and prints one record
// for each element matching "#content-left .article": its 1-based position,
// the ".content span" text, the concatenated ".cmtMain" commenter name and
// comment text, the absolute link built from the ".contentHerf" href, the
// ".thumb img" src when one is present, and the text of
// ".stats .stats-vote .number". Newlines are stripped from the content and
// comment strings before printing.
//
// If the page cannot be loaded the process terminates via log.Fatal.
func GetJokes2() {
	fmt.Println("开始爬取糗事百科热点笑话...")
	page, err := goquery.NewDocument("https://www.qiushibaike.com/hot/")
	if err != nil {
		log.Fatal(err)
	}

	articles := page.Find("#content-left .article")
	articles.Each(func(idx int, article *goquery.Selection) {
		// A missing attribute yields "", so picture is empty for
		// articles without a thumbnail.
		picture, _ := article.Find(".thumb img").Attr("src")
		href, _ := article.Find(".contentHerf").Attr("href")
		body := article.Find(".content span").Text()
		commenter := article.Find(".cmtMain .cmt-name").Text()
		remark := article.Find(".cmtMain .main-text").Text()
		votes := article.Find(".stats .stats-vote .number").Text()

		joke := HotsContent{
			num:     idx + 1,
			url:     "https://www.qiushibaike.com" + href,
			content: strings.Replace(body, "\n", "", -1),
			comment: strings.Replace(commenter+remark, "\n", "", -1),
		}

		fmt.Println("第", joke.num, "个笑话:")
		fmt.Println("\t", joke.content)
		fmt.Println("\t 最热评论:" + joke.comment)
		fmt.Println("\t 地址", joke.url)
		// The image line is the only difference between articles with
		// and without a thumbnail.
		if picture != "" {
			fmt.Println("\t图片地址", picture)
		}
		fmt.Println("\t点赞数", votes)
		fmt.Println("======================================================")
	})
}