请求报文格式分析
package main
import (
"fmt"
"net"
)
// main listens on 127.0.0.1:8000 and prints each raw HTTP request it
// receives, to demonstrate the HTTP request message format.
func main() {
	fmt.Println("Http请求包格式演示案例")
	listener, err := net.Listen("tcp", "127.0.0.1:8000") // listen
	if err != nil {
		fmt.Println("err:", err)
		return
	}
	defer listener.Close() // close the listener when main returns
	for {
		// Block until a client connects.
		conn, err := listener.Accept()
		if err != nil {
			fmt.Println("err:", err)
			return
		}
		// BUG FIX: the original ran `defer conn.Close()` inside this loop,
		// so no connection was closed until main returned — a connection
		// leak that grows with every client. Handling the connection in a
		// helper lets its defer fire at the end of each iteration.
		dumpRequest(conn)
	}
}

// dumpRequest reads up to 1024 bytes of the client's request, prints it,
// and closes the connection.
func dumpRequest(conn net.Conn) {
	defer conn.Close() // close this client's connection when done
	buff := make([]byte, 1024)
	n, err1 := conn.Read(buff)
	if err1 != nil {
		fmt.Println("err1:", err1)
		return
	}
	fmt.Println("buff = ", string(buff[:n]))
}
浏览器访问:http://127.0.0.1:8000/
控制台输出:
GET / HTTP/1.1
Host: 127.0.0.1:8000
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
响应报文测试方法
package main
import (
"fmt"
"net/http"
)
func myHandler(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("hello go!"))
}
// main registers myHandler on /go and serves HTTP on 127.0.0.1:8000,
// to demonstrate the HTTP response message format.
func main() {
	fmt.Println("响应报文格式演示案例")
	http.HandleFunc("/go", myHandler)
	// ListenAndServe blocks while serving and only returns on failure
	// (e.g. the port is already taken). BUG FIX: the original discarded
	// that error, so a startup failure was completely silent.
	if err := http.ListenAndServe("127.0.0.1:8000", nil); err != nil {
		fmt.Println("err:", err)
	}
}
浏览器访问:http://127.0.0.1:8000/go
页面显示:hello go!
http客户端编程
访问百度
package main
import (
"fmt"
"net/http"
)
// main issues a GET request to Baidu and dumps the status line, the
// headers, and the Body field of the response, to demonstrate HTTP
// client programming.
func main() {
	fmt.Println("http编程演示案例")
	resp, err := http.Get("https://www.baidu.com") //func Get(url string) (resp *Response, err error)
	if err != nil {
		// BUG FIX: the original called fmt.Printf("err:", err) — a format
		// string with no verbs, so the error value was never printed
		// (go vet flags this). Println prints it correctly.
		fmt.Println("err:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("Status = ", resp.Status)
	fmt.Println("StatusCode = ", resp.StatusCode)
	fmt.Println("Header = ", resp.Header)
	// Note: this prints the Body struct value itself, not the page HTML;
	// the body would have to be read (e.g. io.ReadAll) to see the content.
	fmt.Println("Body = ", resp.Body)
}
输出结果:
Status = 200 OK
StatusCode = 200
Header = map[Accept-Ranges:[bytes] Cache-Control:[no-cache] Connection:[Keep-Alive] Content-Length:[227] Content-Type:[text/html] Date:[Wed, 20 Mar 2019 11:10:52 GMT] Etag:["5c7cdb1f-e3"] Last-Modified:[Mon, 04 Mar 2019 08:00:31 GMT] P3p:[CP=" OTI DSP COR IVA OUR IND COM "] Pragma:[no-cache] Server:[BWS/1.1] Set-Cookie:[BD_NOT_HTTPS=1; path=/; Max-Age=300 BIDUPSID=1EB8D042488157FB56779477283469A8; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1553080252; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com] Strict-Transport-Security:[max-age=0] X-Ua-Compatible:[IE=Edge,chrome=1]]
Body = &{0xc000034080 {0 0} false 0x60dd50 0x60dcd0}
百度贴吧爬虫
package main
import (
"fmt"
"net/http"
"os"
"strconv"
//"os"
)
// main prompts for a start and end page number and crawls those Tieba pages.
// Example page URLs:
//   http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
//   http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
func main() {
	fmt.Println("百度贴吧爬虫编程演示案例")
	first := promptInt("输入起始页:")
	last := promptInt("输入结束页:")
	doWork(first, last)
}

// promptInt prints label and reads one integer from standard input.
func promptInt(label string) int {
	var v int
	fmt.Print(label)
	fmt.Scan(&v)
	return v
}
// doWork crawls every page from start through end, one page at a time.
func doWork(start, end int) {
	fmt.Printf("正在爬去 %d - %d 页的数据\n", start, end)
	for page := start; page <= end; page++ {
		getPage(page)
	}
}
// getPage downloads one Tieba result page and saves it as "<page>.html".
func getPage(page int) {
	// Each result page holds 50 posts, so the pn offset is page*50.
	url := "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
	fmt.Println("url = ", url)
	res, err := httpGet(url)
	if err != nil {
		fmt.Println("err:", err)
		return
	}
	// Save the page content to a file, e.g. 1.html.
	fileName := strconv.Itoa(page) + ".html"
	f1, err1 := os.Create(fileName)
	if err1 != nil {
		fmt.Println("err1:", err1)
		return
	}
	// BUG FIX: the original never closed f1, leaking one file handle per
	// page crawled.
	defer f1.Close()
	// BUG FIX: the original ignored the WriteString error, so a failed
	// write (disk full, etc.) went unnoticed.
	if _, err := f1.WriteString(res); err != nil {
		fmt.Println("err:", err)
	}
}
// httpGet fetches url and returns the whole response body as a string.
func httpGet(url string) (res string, err error) {
	resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
	if err1 != nil {
		fmt.Println("err1:", err1)
		err = err1
		return
	}
	defer resp.Body.Close()
	// PERF FIX: the original did `res += string(buff[:n])` in the loop,
	// which re-copies the entire accumulated result on every read
	// (quadratic). Append to a byte slice and convert once at the end.
	var body []byte
	buff := make([]byte, 1024)
	for {
		// io.Reader.Read may return data AND an error (io.EOF) from the
		// same call — that is why the original's commented-out attempt to
		// bail out on the error before consuming n dropped the last chunk.
		// Consume n first, then stop on any error/EOF.
		n, readErr := resp.Body.Read(buff)
		if n > 0 {
			body = append(body, buff[:n]...)
		}
		if readErr != nil {
			fmt.Println("读取结束:")
			break
		}
	}
	res = string(body)
	return
}
并发版网络爬虫:
package main
import (
"fmt"
"net/http"
"os"
"strconv"
//"os"
)
// mPage carries the page number of each finished crawl back to doWork.
var mPage = make(chan int)

// main prompts for a start and end page number and crawls that range of
// Tieba pages concurrently.
// Example page URLs:
//   http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=50
//   http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=100
func main() {
	fmt.Println("百度贴吧爬虫编程演示案例")
	var first, last int
	fmt.Print("输入起始页:")
	fmt.Scan(&first)
	fmt.Print("输入结束页:")
	fmt.Scan(&last)
	doWork(first, last)
}
// doWork launches one crawl goroutine per page, then waits until every
// launched page has reported its number on the mPage channel.
func doWork(start, end int) {
	fmt.Printf("正在爬去 %d - %d 页的数据\n", start, end)
	for page := start; page <= end; page++ {
		go getPage(page)
	}
	// NOTE(review): if a getPage goroutine returns early on error without
	// sending on mPage, this receive loop blocks forever — verify that
	// getPage signals completion on every exit path.
	total := end - start + 1
	for done := 0; done < total; done++ {
		fmt.Printf("第 %d页 爬去完成\n", <-mPage)
	}
}
func getPage(page int) {
var url string
url = "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
//fmt.Println("url = ", url)
fmt.Printf("开始爬去第 %d 页,%s\n", page, url)
res, err := httpGet(url)
if err != nil {
fmt.Println("err:", err)
return
}
mPage <- page
// fmt.Println("res:", res)
// 把内容写在文件里 1.html
fileName := strconv.Itoa(page) + ".html"
f1, err1 := os.Create(fileName)
if err1 != nil {
fmt.Println("err1:", err1)
return
}
f1.WriteString(res)
}
// httpGet fetches url and returns the whole response body as a string.
func httpGet(url string) (res string, err error) {
	resp, err1 := http.Get(url) //func Get(url string) (resp *Response, err error)
	if err1 != nil {
		fmt.Println("err1:", err1)
		err = err1
		return
	}
	defer resp.Body.Close()
	// PERF FIX: the original did `res += string(buff[:n])` in the loop,
	// which re-copies the entire accumulated result on every read
	// (quadratic). Append to a byte slice and convert once at the end.
	var body []byte
	buff := make([]byte, 1024)
	for {
		// io.Reader.Read may return data AND an error (io.EOF) from the
		// same call — that is why the original's commented-out attempt to
		// bail out on the error before consuming n dropped the last chunk.
		// Consume n first, then stop on any error/EOF.
		n, readErr := resp.Body.Read(buff)
		if n > 0 {
			body = append(body, buff[:n]...)
		}
		if readErr != nil {
			break
		}
	}
	res = string(body)
	return
}
段子爬虫
package main
import (
"fmt"
"net/http"
"os"
"regexp"
"strconv"
//"os"
)
// mPage carries the page number of each finished crawl back to doWork.
var mPage = make(chan int)

// main crawls a fixed single page (page 1) of the joke site.
func main() {
	fmt.Println("段子爬虫编程演示案例")
	// Interactive range selection is disabled for this demo:
	// fmt.Printf("输入起始页:")
	// fmt.Scan(&start)
	// fmt.Printf("输入结束页:")
	// fmt.Scan(&end)
	start, end := 1, 1
	doWork(start, end)
}
// doWork launches one crawl goroutine per page, then waits until every
// launched page has reported its number on the mPage channel.
func doWork(start, end int) {
	fmt.Printf("正在爬去 %d - %d 页的数据\n", start, end)
	for page := start; page <= end; page++ {
		go getPage(page)
	}
	// NOTE(review): if a getPage goroutine returns early on error without
	// sending on mPage, this receive loop blocks forever — verify that
	// getPage signals completion on every exit path.
	total := end - start + 1
	for done := 0; done < total; done++ {
		fmt.Printf("第 %d页 爬去完成\n", <-mPage)
	}
}
// getPage downloads one page of the joke site and extracts titles with a
// regular expression.
//
// NOTE(review): this block appears garbled by extraction and cannot
// compile as shown — the MustCompile patterns below lost the HTML tags
// they originally matched, `fileTitle` is used but never declared in the
// visible source, and the function is truncated at the final MustCompile
// call. Recover the original patterns before touching the logic.
func getPage(page int) {
var url string
// The Tieba URL is immediately overwritten below; it is dead code kept
// from the earlier crawler version.
url = "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
url = "https://www.pengfue.com/index_" + strconv.Itoa(page) + ".html"
//fmt.Println("url = ", url)
fmt.Printf("开始爬去第 %d 页,%s\n", page, url)
res, err := httpGet(url)
if err != nil {
fmt.Println("err:", err)
return
}
// Title regexp — pattern is incomplete (see NOTE above); the raw string
// spans two lines and matches only a lazy group followed by a newline.
reg := regexp.MustCompile(`(?s:(.*?))
`)
if reg == nil {
fmt.Println("regexp error.")
return
}
titles := reg.FindAllStringSubmatch(res, 1)
for _, title := range titles {
fmt.Println("title:", title[1]) //this is title.
// Store the title for writing to a file later.
// NOTE(review): fileTitle is not declared anywhere in the visible source.
fileTitle = append(fileTitle, title[1])
}
// Process the content — truncated here; the raw string is never closed.
regContent := regexp.MustCompile(`(?s:(.*?))
还需要处理title和content中包含的特殊格式。
并发的爬虫:
package main
import (
"fmt"
"net/http"
"os"
"regexp"
"strconv"
"strings"
//"os"
)
// mPage carries the page number of each finished crawl back to doWork.
var mPage = make(chan int)

// main prompts for a page range and crawls the joke site concurrently.
func main() {
	fmt.Println("---段子爬虫编程演示案例---")
	var first, last int
	fmt.Print("输入起始页:")
	fmt.Scan(&first)
	fmt.Print("输入结束页:")
	fmt.Scan(&last)
	doWork(first, last)
}
// doWork launches one crawl goroutine per page, then waits until every
// launched page has reported its number on the mPage channel.
func doWork(start, end int) {
	fmt.Printf("正在爬去 %d - %d 页的数据...\n", start, end)
	for page := start; page <= end; page++ {
		go getPage(page)
	}
	// NOTE(review): if a getPage goroutine returns early on error without
	// sending on mPage, this receive loop blocks forever — verify that
	// getPage signals completion on every exit path.
	total := end - start + 1
	for done := 0; done < total; done++ {
		fmt.Printf("第【%d】页 爬取完成\n", <-mPage)
	}
}
// getPage downloads one page of the joke site, extracts and cleans the
// titles, and (in the missing remainder) was meant to process content too.
//
// NOTE(review): this block appears garbled by extraction and cannot
// compile as shown — the MustCompile patterns below lost the HTML tags
// they originally matched, `fileTitle` is used but never declared in the
// visible source, `tempContent` is declared but never used, and the
// function is truncated at the final MustCompile call. Recover the
// original patterns before touching the logic.
func getPage(page int) {
var url string
// The Tieba URL is immediately overwritten below; it is dead code kept
// from the earlier crawler version.
url = "http://tieba.baidu.com/f?kw=go%E8%AF%AD%E8%A8%80&ie=utf-8&pn=" + strconv.Itoa(page*50)
url = "https://www.pengfue.com/index_" + strconv.Itoa(page) + ".html"
//fmt.Println("url = ", url)
fmt.Printf("开始爬去第【%d】页:%s\n", page, url)
res, err := httpGet(url)
if err != nil {
fmt.Println("err:", err)
return
}
// Title regexp — pattern is incomplete (see NOTE above); the raw string
// spans two lines and matches only a lazy group followed by a newline.
reg := regexp.MustCompile(`(?s:(.*?))
`)
if reg == nil {
fmt.Println("regexp error.")
return
}
titles := reg.FindAllStringSubmatch(res, 1)
var tempTitle string // scratch variable for cleaning each title
var tempContent string // scratch variable for content (unused in the visible code)
for _, title := range titles {
// Strip whitespace characters from the title before storing it.
tempTitle = title[1]
tempTitle = strings.Replace(tempTitle, "\r", "", -1)
tempTitle = strings.Replace(tempTitle, "\n", "", -1)
tempTitle = strings.Replace(tempTitle, " ", "", -1)
tempTitle = strings.Replace(tempTitle, "\t", "", -1)
fmt.Println("title:", tempTitle) //this is title.
// NOTE(review): fileTitle is not declared anywhere in the visible source.
fileTitle = append(fileTitle, tempTitle)
}
// Process the content — truncated here; the raw string is never closed.
regContent := regexp.MustCompile(`(?s:(.*?))
END.