88-爬虫爬取span信息



我们在爬取网页之后
有大量的无用的信息

所以我们需要用正则表达式去筛选一下

我们先来试试普通爬取

// channel is an unbuffered signalling channel: spider sends true on it and
// startSpider receives from it once for every page it launched.
var channel = make(chan bool)

// main crawls pages 1 through 5 and returns once startSpider has finished waiting.
func main() {
	const firstPage, lastPage = 1, 5
	startSpider(firstPage, lastPage)
}

// startSpider launches one spider goroutine per page number in [start, end]
// and then blocks until it has received one signal on channel for each of them.
func startSpider(start int, end int) {
	launched := 0
	for page := start; page <= end; page++ {
		pageNo := strconv.Itoa(page)
		go spider("https://www.qiushibaike.com/text/page/"+pageNo, "f:/test/第"+pageNo+"页.txt")
		launched++
	}

	// Drain exactly as many signals as goroutines were started.
	for ; launched > 0; launched-- {
		<-channel
	}
}

// spider downloads url and writes the raw response body to filename.
//
// It always sends exactly one value on channel before returning, whether the
// download succeeded or not, so the receive loop in startSpider can never
// block forever on a page that failed.
func spider(url string, filename string) {
	// Registered first so it runs last (after the body is closed) on every
	// return path, including the error ones.
	defer func() { channel <- true }()

	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("爬取失败...", url, err)
		return
	}
	defer resp.Body.Close()

	// A non-200 answer is an error page, not the content we asked for.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("爬取失败...", url, resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取失败...", url, err)
		return
	}

	if err := ioutil.WriteFile(filename, body, 0666); err != nil {
		fmt.Println("写入失败...", filename, err)
		return
	}
}



然后我们加上一个方法
handleBody(bytes)
用正则来匹配我们想要的东西


	// Excerpt from inside spider: read the whole response body into memory,
	// reporting the failure and stopping if the read does not succeed.
	bytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取失败...")
		return
	}

	// Hand the raw page bytes to handleBody for regular-expression matching.
	handleBody(bytes)


// sitePattern matches host names of the form www.<name>.com. The dots are
// escaped so they match a literal '.' instead of any character. Group 1 is the
// whole host, group 2 is the part between "www." and ".com".
var sitePattern = regexp.MustCompile(`(www\.(.*?)\.com)`)

// handleBody prints up to five www.<name>.com matches found in the page.
// Each printed entry is [full match, group 1, group 2].
func handleBody(bytes []byte) {
	page := string(bytes)
	matches := sitePattern.FindAllStringSubmatch(page, 5)
	fmt.Println(matches)
}


我们来查看一下一些网址

结果有

[www.qiushibaike.com www.qiushibaike.com qiushibaike] [www.qiushibaike.com www.qiushibaike.com qiushibaike] [www.app-remix.com www.app-remix.com app-remix]

所以成功找到了一些网址



然后我们再试试找一下span标签

// spanPattern captures the text between a <span> tag and the nearest
// following </span>. Group 1 is the inner text.
var spanPattern = regexp.MustCompile("<span>(.+?)</span>")

// handleBody prints up to five <span>...</span> matches found in the page.
// Each printed entry is [full match including tags, inner text].
func handleBody(bytes []byte) {
	page := string(bytes)
	matches := spanPattern.FindAllStringSubmatch(page, 5)
	fmt.Println(matches)
}

但是我们发现
匹配到的数据还是比较多而且比较杂乱

如果我们只是想要文字怎么办



// channel is an unbuffered signalling channel: spider sends true on it and
// startSpider receives from it once for every page it launched.
var channel = make(chan bool)

// main crawls pages 1 through 5 and returns once startSpider has finished waiting.
func main() {
	const firstPage, lastPage = 1, 5
	startSpider(firstPage, lastPage)
}

// startSpider launches one spider goroutine per page number in [start, end]
// and then blocks until it has received one signal on channel for each of them.
func startSpider(start int, end int) {
	launched := 0
	for page := start; page <= end; page++ {
		pageNo := strconv.Itoa(page)
		go spider("https://www.qiushibaike.com/text/page/"+pageNo, "f:/test/第"+pageNo+"页.txt")
		launched++
	}

	// Drain exactly as many signals as goroutines were started.
	for ; launched > 0; launched-- {
		<-channel
	}
}

func spider(url string, filename string) {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("爬取失败...")
		return
	}
	defer resp.Body.Close()

	bytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("读取失败...")
		return
	}

	str := handleBody(bytes)

	writeErr := ioutil.WriteFile(filename, []byte(str), 0666)
	if writeErr != nil {
		fmt.Println("写入失败...")
		return
	}

	channel <- true
}

// spanPattern captures the text between a <span> tag and the nearest
// following </span>. Group 1 is the inner text.
var spanPattern = regexp.MustCompile("<span>(.+?)</span>")

// handleBody strips whitespace and <br/> tags from the page, finds every
// <span>...</span> element, prints the raw matches, and returns the inner
// texts as numbered lines of the form "第N条是: text\r\n".
func handleBody(bytes []byte) string {
	text := string(bytes)

	// Order matters: spaces go first so that "<br />" collapses to "<br/>"
	// before the tag itself is removed. Newlines must go because '.' in the
	// pattern does not match '\n'.
	for _, junk := range []string{" ", "\n", "\r", "<br/>"} {
		text = strings.Replace(text, junk, "", -1)
	}

	matches := spanPattern.FindAllStringSubmatch(text, -1)
	fmt.Println(matches)

	var out strings.Builder
	for i, groups := range matches {
		for _, g := range groups {
			// groups[0] is the full match with its tags; skip it, and any
			// capture that still carries a nested "span".
			if strings.Contains(g, "span") {
				continue
			}
			out.WriteString("第" + strconv.Itoa(i) + "条是: " + g + "\r\n")
		}
	}
	return out.String()
}

 

你可能感兴趣的:(regexp,golang,spider)