Go 语言爬虫:爬取豆瓣电影 Top 250 并存入 Excel

package main

import (
"fmt"
"github.com/tealeg/xlsx"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"time"
)
// Spider describes a single HTTP GET request: the address to fetch and the
// request headers to send along with it.
type Spider struct {
	// url is the address passed to http.NewRequest.
	url string
	// header holds name/value pairs that are added to the outgoing request.
	header map[string]string
}


// Films is one row of the exported spreadsheet. Every field is kept as text
// because the values are written to cells verbatim.
type Films struct {
	// rows is the 1-based number of the list page the film was found on.
	rows string
	// name is the film title.
	name string
	// scores is the film's rating.
	scores string
	// scores_pepoles is the number of people who rated the film.
	scores_pepoles string
}

// spiderHTTPClient is shared by every request so connections can be reused.
// The timeout keeps an unresponsive server from blocking the crawl forever.
var spiderHTTPClient = &http.Client{Timeout: 30 * time.Second}

// get_html_header sends a GET request to s.url, adding every entry of
// s.header as a request header, and returns the response body as a string.
//
// Any failure (the request cannot be built, the round trip fails, the server
// answers with a status other than 200, or the body cannot be read) is
// reported on standard output and an empty string is returned, so callers
// never receive a partially read or error page.
func (s Spider) get_html_header() string {
	req, err := http.NewRequest("GET", s.url, nil)
	if err != nil {
		fmt.Printf("building request for %q: %v\n", s.url, err)
		return ""
	}
	for key, value := range s.header {
		req.Header.Add(key, value)
	}

	resp, err := spiderHTTPClient.Do(req)
	if err != nil {
		// resp is nil here, so it must not be touched.
		fmt.Printf("fetching %q: %v\n", s.url, err)
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("fetching %q: unexpected status %s\n", s.url, resp.Status)
		return ""
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("reading response from %q: %v\n", s.url, err)
		return ""
	}
	return string(body)
}
// parse collects the films returned by getFilms and writes them to an xlsx
// workbook with a single sheet named "sheet1". The first row holds the column
// titles; each following row holds one film (page, title, rating, number of
// raters).
//
// The workbook is saved to the fixed path "C:/file.xlsx". Errors from creating
// the sheet or saving the file are printed to standard output; a sheet
// creation error aborts the export before any page is fetched.
func parse() {
	file := xlsx.NewFile()
	sheet, err := file.AddSheet("sheet1")
	if err != nil {
		// Without a sheet there is nothing to write to.
		fmt.Println(err.Error())
		return
	}

	// Column titles, in the same order as the per-film cells below.
	titleRow := sheet.AddRow()
	for _, title := range []string{"页码", "电影名称", "评分", "评价人数"} {
		titleRow.AddCell().Value = title
	}

	for _, film := range getFilms() {
		row := sheet.AddRow()
		row.AddCell().Value = film.rows
		row.AddCell().Value = film.name
		row.AddCell().Value = film.scores
		row.AddCell().Value = film.scores_pepoles
	}

	if err := file.Save("C:/file.xlsx"); err != nil {
		// Println rather than Printf: the error text is data, not a format
		// string, and may legitimately contain '%'.
		fmt.Println(err.Error())
	}
}

// Patterns that pull the per-film fields out of one list page. They are
// compiled once at package scope rather than once per page.
//
// Each capture group stops at the next tag ('<'); a trailing lazy "(.*?)"
// would always capture as little as possible (nothing at all).
//
// NOTE(review): these assume each value sits directly inside an element, e.g.
// `<span class="title">NAME</span>`, and that alternate titles begin with
// "&nbsp;" — confirm against the live page markup.
var (
	// filmVotesRe captures the run of non-space, non-tag characters that
	// immediately precedes "人评价".
	filmVotesRe = regexp.MustCompile(`([^<>\s]+)人评价`)
	// filmScoreRe captures the text following property="v:average">.
	filmScoreRe = regexp.MustCompile(`property="v:average">([^<]*)`)
	// filmTitleRe captures the text following "title">, skipping values
	// that start with whitespace, '&' or '<'.
	filmTitleRe = regexp.MustCompile(`"title">([^\s&<][^<]*)`)
)

// getFilms downloads the ten pages of the Douban Top 250 list (25 entries per
// page) and returns one Films value per film, in page order. Films.rows is
// the 1-based page number the film was found on.
//
// On each page the titles, ratings and vote counts are matched independently
// and paired by position. If the three match counts differ, only the entries
// for which all three values exist are kept and a warning is printed, instead
// of indexing past the end of the shorter list.
func getFilms() []Films {
	const (
		pageCount = 10
		pageSize  = 25
	)

	films := make([]Films, 0, pageCount*pageSize)
	header := map[string]string{
		"Host":                      "movie.douban.com",
		"Connection":                "keep-alive",
		"Cache-Control":             "max-age=0",
		"Upgrade-Insecure-Requests": "1",
		"User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
		"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"Referer":                   "https://movie.douban.com/top250",
	}

	for i := 0; i < pageCount; i++ {
		// Report the same 1-based page number that is stored in Films.rows.
		page := strconv.Itoa(i + 1)
		fmt.Println("正在抓取第" + page + "页......")

		url := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*pageSize) + "&filter="
		spider := &Spider{url, header}
		html := spider.get_html_header()

		votes := filmVotesRe.FindAllStringSubmatch(html, -1)
		scores := filmScoreRe.FindAllStringSubmatch(html, -1)
		titles := filmTitleRe.FindAllStringSubmatch(html, -1)

		// Pair values by position, but never read past the shortest list.
		n := len(votes)
		if len(scores) < n {
			n = len(scores)
		}
		if len(titles) < n {
			n = len(titles)
		}
		if n != len(votes) || n != len(scores) || n != len(titles) {
			fmt.Printf("page %s: matched %d titles, %d scores, %d vote counts; keeping %d films\n",
				page, len(titles), len(scores), len(votes), n)
		}

		for j := 0; j < n; j++ {
			films = append(films, Films{
				rows:           page,
				name:           titles[j][1],
				scores:         scores[j][1],
				scores_pepoles: votes[j][1],
			})
		}
	}
	return films
}






// main runs the crawl-and-export step once and reports how long it took.
func main() {
	start := time.Now()
	parse()
	fmt.Println("爬虫结束,总共耗时: ", time.Since(start))
}

转载于:https://www.cnblogs.com/niulanshandeniu/p/11277380.html

你可能感兴趣的:(爬虫,xhtml)