1.贴吧爬虫
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
"time"
)
func HttpGet ( url string ) ( result string , err error ) {
resp, err1 := http. Get ( url)
if err1 != nil {
err = err1
return
}
defer resp. Body. Close ( )
time. Sleep ( time. Second)
buf := make ( [ ] byte , 4096 )
for {
n, err2 := resp. Body. Read ( buf)
if n == 0 {
fmt. Println ( "读取网页完成" )
break
}
if err2 != nil && err2 != io. EOF {
err = err2
return
}
result += string ( buf[ : n] )
}
return
}
// working crawls pages start..end of the tieba listing sequentially and
// saves each page's HTML to "第 i 页.html" in the current directory.
// Failures on one page are logged and the next page is attempted.
func working(start, end int) {
	fmt.Printf("正在爬取第%d页到%d页....\n", start, end)
	for i := start; i <= end; i++ {
		// pn counts posts, 50 per page, so page i starts at (i-1)*50.
		url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		result, err := HttpGet(url)
		if err != nil {
			fmt.Println("HttpGet err:", err)
			continue
		}
		// os.WriteFile replaces Create+WriteString+Close and, unlike the
		// original, surfaces write errors instead of silently ignoring them.
		name := "第 " + strconv.Itoa(i) + " 页.html"
		if err := os.WriteFile(name, []byte(result), 0644); err != nil {
			fmt.Println("WriteFile err:", err)
		}
	}
}
// main01 reads a start and end page number from stdin and crawls that
// range sequentially via working.
func main01() {
	var first, last int
	fmt.Print("请输入爬取的起始页(>=1):")
	fmt.Scan(&first)
	fmt.Print("请输入爬取的终止页(>=start):")
	fmt.Scan(&last)
	working(first, last)
}
2.贴吧并发式爬虫
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
"time"
)
func HttpGet2 ( url string ) ( result string , err error ) {
resp, err1 := http. Get ( url)
if err1 != nil {
err = err1
return
}
defer resp. Body. Close ( )
time. Sleep ( time. Second)
buf := make ( [ ] byte , 4096 )
for {
n, err2 := resp. Body. Read ( buf)
if n == 0 {
fmt. Println ( "读取网页完成" )
break
}
if err2 != nil && err2 != io. EOF {
err = err2
return
}
result += string ( buf[ : n] )
}
return
}
// SpiderPage downloads page i of the tieba listing, writes it to
// "第 i 页.html", and reports completion on the page channel.
//
// BUG FIX: the original returned early on error WITHOUT sending on page,
// which left working2 blocked forever on <-page (deadlock if any page
// failed). The channel send is now deferred so it happens on every path.
func SpiderPage(i int, page chan int) {
	defer func() { page <- i }()

	url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	result, err := HttpGet2(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}
	// os.WriteFile surfaces write errors the original ignored.
	name := "第 " + strconv.Itoa(i) + " 页.html"
	if err := os.WriteFile(name, []byte(result), 0644); err != nil {
		fmt.Println("WriteFile err:", err)
	}
}
// working2 launches one crawler goroutine per page in [start, end] and
// then blocks until every page has reported back on the channel.
func working2(start, end int) {
	fmt.Printf("正在爬取第%d页到%d页....\n", start, end)
	done := make(chan int)
	for p := start; p <= end; p++ {
		go SpiderPage(p, done)
	}
	// One receive per launched goroutine keeps main alive until all finish.
	for p := start; p <= end; p++ {
		fmt.Printf("第 %d 个页面爬取完成\n", <-done)
	}
}
// main02 reads a page range from stdin and crawls it concurrently via
// working2.
func main02() {
	var first, last int
	fmt.Print("请输入爬取的起始页(>=1):")
	fmt.Scan(&first)
	fmt.Print("请输入爬取的终止页(>=start):")
	fmt.Scan(&last)
	working2(first, last)
}
3.正则-字符
package main
import (
"fmt"
"regexp"
)
// main demonstrates a negated character class: `a[^0-9a-z]c` matches an
// 'a' and a 'c' separated by one character that is neither a digit nor a
// lowercase letter (so "aMc" matches, "abc"/"a7c" do not).
func main() {
	sample := "abc a7c mfc cat aMc azc cba"
	re := regexp.MustCompile(`a[^0-9a-z]c`)
	matches := re.FindAllStringSubmatch(sample, -1)
	fmt.Println("alls:", matches)
}
4.正则-小数
package main
import (
"fmt"
"regexp"
)
// main04 demonstrates matching "digit, dot, digit" substrings: `\d\.\d`
// requires a digit on BOTH sides of the dot, so ".68", "7." and "123."
// do not match.
func main04() {
	text := "3.14 123.123 .68 haha 1.0 abc 7. ab.3 66.6 123."
	re := regexp.MustCompile(`\d\.\d`)
	found := re.FindAllStringSubmatch(text, -1)
	fmt.Println("alls:", found)
}
5.正则-标签
package main
import (
"fmt"
"regexp"
)
// main05 demonstrates a capturing group with the s flag: (?s:...) makes
// "." also match newlines, so the lazy capture can span lines up to the
// literal newline at the end of the pattern.
//
// NOTE(review): the lesson heading says this exercise is about matching
// HTML tags, and both the sample string and the pattern look like their
// tags (e.g. <div>...</div>) were stripped during text extraction — the
// surviving pattern is just a lazy capture followed by a newline. Confirm
// against the original lesson source before relying on this output.
func main05 ( ) {
str := `
Go语言标准库文档中文版 | Go语言中文网 | Golang中文社区 | Golang中国
hello regexp
hello 2
hello 890
hello 664
2块钱啥时候还?
过了年再说吧!
刚买了车,没钱。。。
身体
`
// The raw-string pattern ends with a real newline character.
ret := regexp. MustCompile ( `(?s:(.*?))
` )
// -1 returns every non-overlapping match with its submatches.
alls := ret. FindAllStringSubmatch ( str, - 1 )
fmt. Println ( "alls:" , alls)
// one[0] is the whole match; one[1] is the first capture group.
for _ , one := range alls {
fmt. Println ( "one[0]=" , one[ 0 ] )
fmt. Println ( "one[1]=" , one[ 1 ] )
}
}
6.并发提取爬虫
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
func HttpGetDB ( url string ) ( result string , err error ) {
resp, err1 := http. Get ( url)
if err1 != nil {
err = err1
return
}
defer resp. Body. Close ( )
buf := make ( [ ] byte , 4096 )
for {
n, err2 := resp. Body. Read ( buf)
if n == 0 {
break
}
if err2 != nil && err2 != io. EOF {
err = err2
return
}
result += string ( buf[ : n] )
}
return
}
// Save2file writes one crawled page's film data as a tab-separated table
// to "C:/itcast/第 idx 页.txt". Each of filmName/filmScore/peopleNum is a
// FindAllStringSubmatch result, so row[1] is the first capture group.
//
// NOTE(review): the output directory is hard-coded and must already
// exist, or os.Create fails and the data is dropped with a log line.
func Save2file(idx int, filmName, filmScore, peopleNum [][]string) {
	path := "C:/itcast/" + "第 " + strconv.Itoa(idx) + " 页.txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err:", err)
		return
	}
	defer f.Close()

	f.WriteString("电影名称" + "\t\t\t" + "评分" + "\t\t" + "评分人数" + "\n")

	// BUG FIX: the original iterated len(filmName) rows but indexed all
	// three slices, panicking whenever the regexes matched different
	// counts. Clamp the row count to the shortest slice.
	n := len(filmName)
	if len(filmScore) < n {
		n = len(filmScore)
	}
	if len(peopleNum) < n {
		n = len(peopleNum)
	}
	for i := 0; i < n; i++ {
		f.WriteString(filmName[i][1] + "\t\t\t" + filmScore[i][1] + "\t\t" + peopleNum[i][1] + "\n")
	}
}
func SpiderPageDB ( idx int , page chan int ) {
url := "https://movie.douban.com/top250?start=" + strconv. Itoa ( ( idx- 1 ) * 25 ) + "&filter="
result, err := HttpGetDB ( url)
if err != nil {
fmt. Println ( "HttpGet2 err:" , err)
return
}
ret1 := regexp. MustCompile ( `)
filmName := ret1. FindAllStringSubmatch ( result, - 1 )
pattern := `(?s:(.*?)) `
ret2 := regexp. MustCompile ( pattern)
filmScore := ret2. FindAllStringSubmatch ( result, - 1 )
ret3 := regexp. MustCompile ( `(?s:(\d*?))人评价 ` )
peopleNum := ret3. FindAllStringSubmatch ( result, - 1 )
Save2file ( idx, filmName, filmScore, peopleNum)
page <- idx
}
// toWork starts one crawler goroutine per page in [start, end], then
// blocks until each goroutine signals completion on the channel.
func toWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d 页...\n", start, end)
	done := make(chan int)
	for p := start; p <= end; p++ {
		go SpiderPageDB(p, done)
	}
	// Receive exactly once per goroutine so main waits for all of them.
	for p := start; p <= end; p++ {
		fmt.Printf("第 %d 页爬取完毕\n", <-done)
	}
}
// main06 reads a page range from stdin and crawls the douban Top250
// pages concurrently via toWork.
func main06() {
	var first, last int
	fmt.Print("请输入爬取的起始页(>=1):")
	fmt.Scan(&first)
	fmt.Print("请输入爬取的终止页(>=start):")
	fmt.Scan(&last)
	toWork(first, last)
}