涉及问题
- 各种包(例如 golang.org/x/net、golang.org/x/text 等)的下载问题:其实 golang 在 GitHub 上为这些包建立了镜像库,例如 https://github.com/golang/net 就是 https://golang.org/x/net 的镜像库。
- golang 与数据库交互
- golang 文件读写
- golang 多线程使用
- golang 字符编码的转换
- js 标签选择器
系列文章 https://www.cnblogs.com/majianguo/p/8146110.html
1). 一个 collector(即下文代码中的 controller)可以注册多个 OnHTML 回调函数 2). "div > p" --> div 的直接子元素 p, "div p" --> div 下的所有后代元素 p(不限层级)
参考 : https://blog.csdn.net/webxscan/article/details/70174658
import (
"database/sql"
_ "github.com/go-sql-driver/mysql"
)
// Database connection settings; they are joined into the MySQL DSN below.
const (
userName = "root"
password = ""
ip = "127.0.0.1"
port = "3306"
dbName = "dbName"
)
// Build the DSN in the form user:password@tcp(ip:port)/dbName?charset=utf8.
path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
DB, err := sql.Open("mysql", path)
if err != nil {
	// sql.Open failed, so there is no usable handle to continue with.
	fmt.Println("open database fail")
	return
}
// Register the close before any early return so the handle is always released.
defer DB.Close()
// Verify the connection.
if errConn := DB.Ping(); errConn != nil {
	fmt.Println("open database fail")
	return
}
fmt.Println("connnect success")
// The column list needs a comma between name and age.
stmt, err := DB.Prepare("insert into user(name, age) values(?, ?)")
if err != nil {
	fmt.Println(err)
	return // stmt is nil here; calling Exec on it would panic
}
defer stmt.Close()
res, err := stmt.Exec("username", 18)
if err != nil {
	fmt.Println(err)
	return // res is nil here
}
// Print the id of the newly inserted row (and the error LastInsertId returns).
fmt.Println(res.LastInsertId())
// fileName is the path that main passes to openFile.
var fileName = "./flag.txt"
// file holds the handle assigned by openFile and by main.
var file *os.File
// err holds the error assigned by openFile.
var err error
// main opens fileName through openFile and writes one line of text to it
// through writeFile.
func main() {
	file = openFile(fileName)
	// Release the handle when main returns; it was never closed before.
	defer file.Close()
	writeFile(file, "keep coding!!")
}
func openFile(fileName string) *os.File {
if checkFileIsExist(fileName) {
//如果文件存在
file, err = os.OpenFile(fileName, os.O_APPEND, 0666)
} else {
//创建文件
file, err = os.Create(fileName)
}
check(err)
return file
}
// writeFile writes content followed by "\r\n" to file through a buffered
// writer and flushes the buffer before returning. Errors from the writes and
// from the flush are not reported.
func writeFile(file *os.File, content string) {
	out := bufio.NewWriter(file)
	for _, piece := range []string{content, "\r\n"} {
		out.WriteString(piece)
	}
	out.Flush()
}
// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
// checkFileIsExist reports whether filename exists. It returns false only
// when os.Stat fails with a "does not exist" error; any other Stat outcome,
// including other errors, yields true.
func checkFileIsExist(filename string) bool {
	_, statErr := os.Stat(filename)
	return !os.IsNotExist(statErr)
}
抓取链家网数据时,网页本身是utf8编码,没有问题,可是抓取房天下的数据时,网页本身是gb2312,折腾了好一会, F**K
// coverGBKToUTF8 runs src through mahonia's "gbk" decoder and returns the
// converted string. src is the string to convert.
func coverGBKToUTF8(src string) string {
	// Some posts online say a separate translate call is needed as well; in
	// testing ConvertString alone was enough.
	gbkDecoder := mahonia.NewDecoder("gbk")
	return gbkDecoder.ConvertString(src)
}
附上测试代码,便于理解
// main starts ten goroutines, each of which sends its index on its own
// channel, then receives from the channels in order and reports each value.
func main() {
	// A channel can also be created with a buffer:
	//c := make(chan int, 1024)
	// Reading from a buffered channel:
	//for i := range c {
	//}
	// That creates an int channel with room for 1024 values: even without a
	// reader, the writer can keep sending and only blocks once the buffer is
	// full.
	chs := make([]chan int, 10)
	for i := range chs {
		// Create the channel before the goroutine starts. When it was created
		// inside the goroutine, the receive loop below could pick up a
		// still-nil slot and block on it forever.
		chs[i] = make(chan int)
		// Any per-goroutine logging has to happen before the send inside count.
		go count(chs[i], i)
	}
	for _, ch := range chs {
		// Each channel carries exactly one value, so no close is needed.
		value := <-ch
		fmt.Println(value, " thread done")
	}
	fmt.Println("All done")
}
// count sends i on ch. The send has to run inside a goroutine so that it does
// not block the main goroutine.
func count(ch chan int, i int) {
	ch <- i
}
## 最后,附上抓取房天下小区信息的源代码
package main
import (
	"database/sql"
	"fmt"
	"reflect"
	"strconv"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/axgle/mahonia"
	_ "github.com/go-sql-driver/mysql"
	"github.com/gocolly/colly"
)
// Database connection settings; main joins them into the MySQL DSN.
const (
userName = "root"
password = ""
ip = "127.0.0.1"
port = "3306"
dbName = "rixin"
)
// XQinfo holds the attributes scraped for one community. insertDB reads the
// field names by reflection and uses them as the column names of
// rx_xiaoqu_more, so renaming a field changes the generated SQL. The labels
// quoted below are the page labels that setXQinfo and detailVisit match on.
type XQinfo struct {
name string // text of ".ceninfo_sq > h1 > a" on the detail page
addr string // value labelled "小区地址"
area string // value labelled "所属区域"
price string // text of the price span with "元/㎡ " appended
imgs string // src attribute of the banner image
post_code string // value labelled "邮编"
property_right string // value labelled "产权描述"
property_type string // value labelled "物业类别"
build_time int // first four characters of the "建筑年代" value parsed as an integer
developer string // value labelled "开 发 商"
build_type string // value labelled "建筑类型"
build_area string // value labelled "建筑面积"
build_struct string // value labelled "建筑结构"
floor_space string // value labelled "占地面积"
house_count string // value labelled "房屋总数"
manager_company string // value labelled "物业公司"
green_ratio string // value labelled "绿 化 率"
plot_ratio string // value labelled "容 积 率"
proprety_fee string // value labelled "物 业 费"
additional_info string // value labelled "附加信息"
water_supply string // value labelled "供水"
heat_supply string // value labelled "供暖"
elec_supply string // value labelled "供电"
gas string // value labelled "燃气"
security string // value labelled "安全管理"
environment string // value labelled "卫生服务"
parking_space string // value labelled "停 车 位"
other_info string // "周边信息" entries joined with "|"
district string // district: the part of the "所属区域" value before its first space
}
// flagCh receives one value from work each time a list page has been scraped.
var flagCh = make(chan int)
// infos collects the records appended by detailVisit until main inserts them.
var infos = make([]XQinfo, 0)
// count is incremented by main for every value received on flagCh; note that
// it starts at 1, not 0.
var count = 1
// page is the number of list pages main requests per outer-loop step.
var page = 10
// main connects to MySQL, starts one work goroutine per community list page,
// waits until every goroutine has reported on flagCh, and then inserts all
// collected records into the database.
func main() {
	path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
	// groups is the number of outer-loop steps; each step covers `page` list
	// pages. (Named groups rather than max to avoid shadowing the builtin.)
	groups := 10
	DB, err := sql.Open("mysql", path)
	if err != nil {
		fmt.Println("open database fail")
		return
	}
	// Register the close before any early return.
	defer DB.Close()
	// Verify the connection.
	if errConn := DB.Ping(); errConn != nil {
		fmt.Println("open database fail")
		return
	}
	fmt.Println("connnect success")

	total := groups * page
	for i := 0; i < groups; i++ {
		for j := 1; j <= page; j++ {
			// Page numbers run contiguously from 1 to total. The stride is
			// page rather than a hard-coded 10 so changing page stays correct.
			pageNo := i*page + j
			link := "https://tj.esf.fang.com/housing/__0_0_0_0_" + strconv.Itoa(pageNo) + "_0_0_0/"
			go work(link, pageNo)
		}
	}

	// Wait for exactly one flag per started goroutine. The counter used to
	// start at 1, so the loop stopped one flag early and the last page's
	// records could be missing when the inserts ran.
	count = 0
	for count < total {
		<-flagCh
		count++
		fmt.Println("<- receive the " + strconv.Itoa(count) + " thread ending flag")
	}

	for _, info := range infos {
		insertDB(DB, info)
	}
	fmt.Println("insert DB == ", len(infos))
	fmt.Println("All " + strconv.Itoa(count) + " has done")
}
func work(url string, page int) {
c := colly.NewCollector()
// 小区列表页
c.OnHTML("body", func(e *colly.HTMLElement) {
e.DOM.Find(".plotListwrap").Each(func(i int, selection *goquery.Selection) {
info := XQinfo{}
link, _ := selection.Find(".plotListwrap > dt > a").Attr("href")
// 行政区名称
//district := selection.Find(".plotListwrap > dd > p:nth-child(2) > a:nth-child(1)").Text();
//info.district = district
detailVisit(info, link)
})
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL.String())
})
c.OnScraped(func(response *colly.Response) {
fmt.Println("the " + strconv.Itoa(page) + " thread sending end flag ->")
flagCh <- 1
})
c.Visit(url)
}
// infosMu guards infos: detailVisit is reached from many work goroutines at
// once, and an unsynchronised append can lose or corrupt records.
var infosMu sync.Mutex

// detailVisit loads the community page at link, follows its "details" link,
// fills info from the detail page and appends the result to infos.
// info is received by value, so the caller's copy is not modified.
func detailVisit(info XQinfo, link string) {
	detailLink := colly.NewCollector()
	detailController := detailLink.Clone()

	// Detail page handler. It is registered once here rather than inside the
	// handler below, so it cannot be registered more than once.
	detailController.OnHTML("body", func(e *colly.HTMLElement) {
		// Community name.
		name := e.DOM.Find(".ceninfo_sq > h1 > a").Text()
		info.name = coverString(name)
		price := e.DOM.Find(".detaiLtop > dl:nth-child(1) > dd > span").Text()
		info.price = price + "元/㎡ "
		e.DOM.Find(".inforwrap").Each(func(i int, selection *goquery.Selection) {
			// Section title, taken from the element preceding the section.
			modelName := coverString(selection.Prev().Find("h3").Text())
			switch modelName {
			case "基本信息", "配套设施":
				dealInfo(selection, &info)
			case "周边信息":
				selection.Find("dl dt").Each(func(_ int, otherSelect *goquery.Selection) {
					tab := coverString(otherSelect.Text())
					// Entries containing this marker are skipped.
					if !strings.Contains(tab, "本段合作") {
						info.other_info = info.other_info + tab + "|"
					}
				})
			}
		})
		infosMu.Lock()
		infos = append(infos, info)
		infosMu.Unlock()
	})

	detailLink.OnHTML("body", func(e *colly.HTMLElement) {
		// Link to the detail page.
		link1, ok := e.DOM.Find("#kesfxqxq_A01_03_01 > a").Attr("href")
		// Image address.
		img, _ := e.DOM.Find(".bannerbg_pos > a > img").Attr("src")
		info.imgs = img
		// Price (disabled; it is read from the detail page instead):
		//price := e.DOM.Find(".prib").Text()
		//info.price = price + "元/㎡ "
		if !ok || link1 == "" {
			// Without a detail link there is nothing to visit.
			return
		}
		// Visit the community detail page.
		if err := detailController.Visit(link1); err != nil {
			fmt.Println("visit", link1, "failed:", err)
		}
	})
	if err := detailLink.Visit(link); err != nil {
		fmt.Println("visit", link, "failed:", err)
	}
}
// dealInfo processes a community information section: it passes every
// "dl dd" element of selection, and then every "dl dt" element, to setXQinfo
// so that the matching fields of info are filled in.
func dealInfo(selection *goquery.Selection, info *XQinfo) {
	apply := func(_ int, item *goquery.Selection) {
		setXQinfo(item, info)
	}
	for _, query := range []string{"dl dd", "dl dt"} {
		selection.Find(query).Each(apply)
	}
}
// splitAtColon splits s at its first colon, accepting both the ASCII ':' and
// the full-width U+FF1A form. It returns the text before and after the colon;
// when s has no colon it returns s, "" and false.
func splitAtColon(s string) (before, after string, found bool) {
	for i, r := range s {
		if r == ':' || r == '\uFF1A' {
			// len(string(r)) is the colon's real byte width (1 or 3).
			return s[:i], s[i+len(string(r)):], true
		}
	}
	return s, "", false
}

// setXQinfo reads one label/value element and stores the value in the field
// of info that matches the label. The label is the element's <strong> text up
// to the colon. The value is the element's title attribute when present,
// otherwise the element text after the colon. Unknown labels are ignored.
func setXQinfo(selectionbase *goquery.Selection, info *XQinfo) {
	orgKey := coverString(selectionbase.Find("strong").Text())
	key, _, _ := splitAtColon(orgKey)

	value, ok := selectionbase.Attr("title")
	if ok {
		value = coverString(value)
	} else {
		fullValue := coverString(selectionbase.Text())
		// The value used to be cut a fixed three bytes after the colon. That
		// is only right for a full-width colon, and it panicked or produced
		// garbage when the text was short or had no colon at all.
		if _, rest, found := splitAtColon(fullValue); found {
			value = rest
		} else {
			value = fullValue
		}
	}

	switch key {
	case "小区地址":
		info.addr = value
	case "所属区域":
		info.area = value
		// The district is the part before the first space; fall back to the
		// whole value instead of panicking when there is no space.
		if sp := strings.Index(value, " "); sp >= 0 {
			info.district = value[:sp]
		} else {
			info.district = value
		}
	case "邮编":
		info.post_code = value
	case "产权描述":
		info.property_right = value
	case "物业类别":
		info.property_type = value
	case "建筑年代":
		// Use at most the first four characters; shorter values used to panic.
		digits := []rune(value)
		if len(digits) > 4 {
			digits = digits[:4]
		}
		// A non-numeric prefix yields 0, as before.
		year, _ := strconv.Atoi(string(digits))
		info.build_time = year
	case "开 发 商":
		info.developer = value
	case "建筑结构":
		info.build_struct = value
	case "建筑类型":
		info.build_type = value
	case "建筑面积":
		info.build_area = value
	case "占地面积":
		info.floor_space = value
	case "物业公司":
		info.manager_company = value
	case "房屋总数":
		info.house_count = value
	case "绿 化 率":
		info.green_ratio = value
	case "容 积 率":
		info.plot_ratio = value
	case "物 业 费":
		info.proprety_fee = value
	case "附加信息":
		info.additional_info = value
	case "供水":
		info.water_supply = value
	case "供暖":
		info.heat_supply = value
	case "供电":
		info.elec_supply = value
	case "燃气":
		info.gas = value
	case "安全管理":
		info.security = value
	case "卫生服务":
		info.environment = value
	case "停 车 位":
		info.parking_space = value
	}
}
// coverGBKToUTF8 runs src through mahonia's "gbk" decoder and returns the
// converted string. src is the string to convert.
func coverGBKToUTF8(src string) string {
	gbkDecoder := mahonia.NewDecoder("gbk")
	return gbkDecoder.ConvertString(src)
}
// replaceNullHtml removes every occurrence of "聽" from src and then every
// occurrence of "小区网" from the result. The removals run one after the
// other, so a "小区网" that only forms once "聽" is gone is removed as well.
// NOTE(review): "聽" is presumably a mis-decoded non-breaking space — confirm.
func replaceNullHtml(src string) string {
	cleaned := src
	for _, junk := range []string{"聽", "小区网"} {
		cleaned = strings.Replace(cleaned, junk, "", -1)
	}
	return cleaned
}
func coverString(src string) string {
return replaceNullHtml(coverGBKToUTF8(src))
}
// insertStatement builds a parameterised INSERT for rx_xiaoqu_more from info.
// It returns the SQL text, with one "?" placeholder per struct field, and the
// field values in the same order. Column names are the struct field names.
func insertStatement(info XQinfo) (string, []interface{}) {
	t := reflect.TypeOf(info)
	v := reflect.ValueOf(info)
	n := t.NumField()

	columns := make([]string, 0, n)
	marks := make([]string, 0, n)
	args := make([]interface{}, 0, n)
	for i := 0; i < n; i++ {
		columns = append(columns, t.Field(i).Name)
		marks = append(marks, "?")
		// The fields are unexported, so Value.Interface would panic; read the
		// values through the typed accessors instead.
		switch field := v.Field(i); field.Kind() {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			args = append(args, field.Int())
		case reflect.String:
			args = append(args, field.String())
		default:
			args = append(args, fmt.Sprint(field))
		}
	}

	query := "insert into rx_xiaoqu_more(" + strings.Join(columns, ", ") +
		") values (" + strings.Join(marks, ", ") + ")"
	return query, args
}

// insertDB inserts info as one row of rx_xiaoqu_more and prints the new row's
// id. On failure it prints the statement and the error and returns.
//
// Values are bound as parameters. They used to be pasted into the SQL text
// between single quotes, so scraped text containing a quote broke the
// statement and could inject SQL.
func insertDB(DB *sql.DB, info XQinfo) {
	query, args := insertStatement(info)
	// Exec prepares, runs and releases the statement in one call, so no
	// prepared statement is left open.
	res, err := DB.Exec(query, args...)
	if err != nil {
		fmt.Println(query)
		fmt.Println(err)
		return // res is nil here
	}
	fmt.Println(res.LastInsertId())
}