go爬虫 goquery+正则

本文用到了 goquery 和 regexp 两个包。

goquery 用法:

			dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))
			if err!=nil{
				fmt.Println("HttpGet err :",err)
			}

			dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
				// if selection != nil {
					title  += selection.Text() + "\r"
					titleS = append(titleS,selection.Text())
				// }
			})

regexp用法:

        		// fmt.Println(video_cont)
				rel2 := regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)
				if rel2 == nil {
					fmt.Println("准备好了12")
				}
				arr2 := rel2.FindAllStringSubmatch(video_cont,-1) //获取的数据是二维的切片

 

package main 
import (
	"fmt"
	"strconv"
	"net/http"
	"os"
	"regexp"
	"strings"
	"github.com/PuerkitoBio/goquery"  
	"database/sql"
    _ "github.com/go-sql-driver/mysql"
)
// type collectionmwd struct {
//     ID int64 `db:"id"`
//     title string `db:title`
//     cover string  `db:"cover"` //由于在mysql的users表中name没有设置为NOT NULL,所以name可能为null,在查询过程中会返回nil,如果是string类型则无法接收nil,但sql.NullString则可以接收nil值
//     videoUrl string `db:"videourl"`
// }
// Database connection settings. The values describe a server on localhost,
// port 3306, and this file imports the go-sql-driver/mysql driver.
// NOTE(review): the code that consumes these constants is not visible in this
// section; presumably they are assembled into the DSN handed to sql.Open —
// confirm against the caller.
const (
    USERNAME = "root"          // database user name
    PASSWORD = "root"          // database password (hard-coded; NOTE(review): consider loading from the environment)
    NETWORK  = "tcp"           // network type for the connection
    SERVER   = "localhost"     // database host
    PORT     = 3306            // database port
    DATABASE = "guanfu_school" // database (schema) name
)
// main prompts on standard output for the first and last year to crawl, reads
// both values from standard input, and passes them to Dowork.
//
// If either value cannot be read as an integer, the error is printed and main
// returns without crawling. Previously the Scan errors were ignored, so a
// malformed entry left the variable at 0 and Dowork was still invoked with an
// unintended range.
func main() {
	var start, end int

	fmt.Printf("请输入起始页:(2013开始,2019结束)")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Printf("请输入结束页:(2013开始,2019结束)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	Dowork(start, end)

	// A commented-out scaffold that called insertData(image, title, path)
	// with hard-coded sample slices used to live here for manual testing.
}

func Dowork (start , end int) {
	// return
	fmt.Println("正在爬取数据")
    // var title string
	//开始循环每个年份 
	//然后每个年份再循环找每一页的数据
	for i:=start;i<=end;i++ {
		for j:=1;j<=3;j++ {
			//写入文件
			var img string
			var title string 
			var path string 
			//插入数据库数据
			var imgS   = make([]string,0)
			var titleS = make([]string,0)
			var pathS  = make([]string,0)

			var url = fmt.Sprintf("http://vod.gxtv.cn/program/28/%s/%s.html",strconv.Itoa(i),strconv.Itoa(j)) 
			//获取一年中每页的内容
			result,err := HttpGet(url)
			if err != nil {
				fmt.Println("HttpGet err :",err)
				break
			}
			// 过滤标题
			dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))
			if err!=nil{
				fmt.Println("HttpGet err :",err)
			}

			dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
				// if selection != nil {
					title  += selection.Text() + "\r"
					titleS = append(titleS,selection.Text())
				// }
			})
		    fmt.Println("title:",title)
		    fmt.Println("titleS",titleS)
		       
			// 过滤封面url
			imgReg := regexp.MustCompile(` 1 {
					img += data[1] + "\r"
					imgS = append(imgS,data[1])
				}
			}
			fmt.Println("img :",img)
			fmt.Println("imgS :",imgS)

			//找到详情页路径  再爬取代码
			rel := regexp.MustCompile(`
  •  

    你可能感兴趣的:(go)