首次 golang爬虫插件gocolly/colly 使用经历

涉及问题

  • 各种包(例如 golang.org/x/net、golang.org/x/text 等)的下载问题:golang 在 GitHub 上为这些包建立了镜像库,例如 https://github.com/golang/net 即是 https://golang.org/x/net 的镜像库。
  • golang 与数据库交互
  • golang 文件读写
  • golang 多线程使用
  • golang 字符编码的转换
  • js 标签选择器

参考

系列文章 https://www.cnblogs.com/majianguo/p/8146110.html

start

js 选择器的问题

1). 一个 collector 可以注册多个 OnHTML 回调函数 2). "div > p" --> div 的直接子元素 p, "div p" --> div 下的所有后代元素 p(不限层级)

golang数据库连接问题

参考 : https://blog.csdn.net/webxscan/article/details/70174658

import (
    "database/sql"
    _ "github.com/go-sql-driver/mysql"
)

// Database connection settings; the snippet below joins them into the MySQL
// data source name passed to sql.Open.
const (
    userName    = "root"
    password    = ""
    ip          = "127.0.0.1"
    port        = "3306"
    dbName      = "dbName"
)

// Build the data source name: user:password@tcp(ip:port)/dbName?charset=utf8.
path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")

// sql.Open can fail (for example on an unknown driver name), so its error
// must not be discarded.
DB, err := sql.Open("mysql", path)
if err != nil {
    fmt.Println("open database fail")
    return
}
// Register the close before any early return so the handle is always released.
defer DB.Close()

// Verify that the server is actually reachable.
if errConn := DB.Ping(); errConn != nil {
    fmt.Println("open database fail")
    return
}
fmt.Println("connnect success")

// The column list needs a comma between name and age.
stmt, err := DB.Prepare("insert into user(name, age) values(?, ?)")
if err != nil {
    fmt.Println(err)
    // stmt is nil here; continuing would dereference it.
    return
}
defer stmt.Close()

res, err := stmt.Exec("username", 18)
if err != nil {
    fmt.Println(err)
    // res is nil here; continuing would dereference it.
    return
}
// Print the id of the newly inserted row.
fmt.Println(res.LastInsertId())

golang文件写入

// fileName is the path of the file the example writes to.
var fileName = "./flag.txt"
// file is the package-level handle that main stores the opened file in.
var file *os.File
// err is a package-level error variable shared by the file helpers.
var err error

// main opens (or creates) fileName, appends one line to it and closes it.
func main() {
    file = openFile(fileName)
    // Release the descriptor once the line has been written and flushed.
    defer file.Close()
    writeFile(file, "keep coding!!")
}

// openFile opens fileName for appending, creating it when it does not exist,
// and returns the handle. It panics if the file cannot be opened.
//
// os.O_APPEND on its own opens the file read-only, so every later write would
// fail; O_WRONLY is required as well. O_CREATE lets a single call cover both
// the "exists" and "missing" cases without a separate existence check.
func openFile(fileName string) *os.File {
    f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
    if err != nil {
        panic(err)
    }
    return f
}

// writeFile writes content followed by a CRLF line ending to file through a
// buffered writer, flushing the buffer before it returns.
func writeFile(file *os.File, content string) {
    out := bufio.NewWriter(file)
    defer out.Flush()

    for _, piece := range []string{content, "\r\n"} {
        out.WriteString(piece)
    }
}

// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
    if e == nil {
        return
    }
    panic(e)
}

// checkFileIsExist reports whether filename exists. Only a "does not exist"
// result from os.Stat yields false; any other outcome yields true.
func checkFileIsExist(filename string) bool {
    _, statErr := os.Stat(filename)
    return !os.IsNotExist(statErr)
}

golang字符编码的转换

抓取链家网数据时,网页本身是utf8编码,没有问题,可是抓取房天下的数据时,网页本身是gb2312,折腾了好一会, F**K

// coverGBKToUTF8 decodes src, the string to convert, with mahonia's "gbk"
// decoder and returns the result.
func coverGBKToUTF8(src string) string {
    // Some online posts say a Translate call is also needed; in practice it
    // turned out not to be required.
    gbkDecoder := mahonia.NewDecoder("gbk")
    return gbkDecoder.ConvertString(src)
}

golang多线程,控制消息同步

附上测试代码,便于理解

// main starts ten goroutines, each reporting its index on its own channel,
// and waits for them in index order.
func main() {
    // A buffered channel can be created as well:
    //c := make(chan int, 1024)
    // Reading from a buffered channel:
    //for i:= range c {
    //}
    // That creates an int channel with capacity 1024: even without a reader,
    // the writer can keep sending and only blocks once the buffer is full.

    chs := make([]chan int, 10)
    for i := range chs {
        // Create the channel before the goroutine starts. If it were created
        // inside the goroutine, the receive loop below could read a still-nil
        // slot and block forever on a nil channel.
        chs[i] = make(chan int)
        go count(chs[i], i)
    }

    for _, ch := range chs {
        value := <-ch
        // The single send on this channel has completed, so closing is safe.
        close(ch)
        fmt.Println(value, " thread done")
    }
    fmt.Println("All done")
}

// count sends i on ch. On an unbuffered channel the send blocks until a
// receiver is ready, so it has to run inside a goroutine to avoid blocking
// the main goroutine.
func count(ch chan int, i int) {
    var out chan<- int = ch
    out <- i
}

## 最后,附上抓取房天下小区信息的源代码

package main

import (
    "database/sql"
    "fmt"
    "reflect"
    "strconv"
    "strings"
    "sync"

    "github.com/PuerkitoBio/goquery"
    "github.com/axgle/mahonia"
    _ "github.com/go-sql-driver/mysql"
    "github.com/gocolly/colly"
)

// Database connection settings; main joins them into the MySQL data source
// name passed to sql.Open.
const (
    userName    = "root"
    password    = ""
    ip          = "127.0.0.1"
    port        = "3306"
    dbName      = "rixin"
)


// XQinfo holds the data collected for one residential community.
//
// insertDB walks the fields by reflection and uses each field name verbatim
// as a column name of rx_xiaoqu_more, so renaming or reordering a field
// changes the generated SQL. Most fields are filled by setXQinfo from the
// labelled rows of the detail page; name, price, imgs and other_info are set
// directly in detailVisit.
type XQinfo struct {
    name                string
    addr                string
    area                string
    price               string
    imgs                string
    post_code           string
    property_right      string
    property_type       string
    build_time          int
    developer           string
    build_type          string
    build_area          string
    build_struct        string
    floor_space         string
    house_count         string
    manager_company     string
    green_ratio         string
    plot_ratio          string
    proprety_fee        string
    additional_info     string
    water_supply        string
    heat_supply         string
    elec_supply         string
    gas                 string
    security            string
    environment         string
    parking_space       string
    other_info          string
    district            string      // district: the part of area before its first space
}

// flagCh carries one completion signal per work call; main receives from it.
var flagCh      = make(chan  int)
// infos accumulates the scraped communities; main inserts them into the database.
var infos       = make([]XQinfo, 0)
// count is the completion counter that main increments and prints.
var count       = 1
// page is the number of listing pages per group in main's inner loop.
var page        = 10

// main connects to MySQL, starts one work goroutine per listing page, waits
// until every one of them has signalled on flagCh, and then inserts all
// collected records.
func main() {
    path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
    groups := 10

    DB, err := sql.Open("mysql", path)
    if err != nil {
        fmt.Println("open database fail")
        return
    }
    // Register the close before any early return so the handle is always released.
    defer DB.Close()

    // Verify that the server is actually reachable.
    if errConn := DB.Ping(); errConn != nil {
        fmt.Println("open database fail")
        return
    }
    fmt.Println("connnect success")

    total := groups * page
    for i := 0; i < groups; i++ {
        for j := 1; j <= page; j++ {
            // Page numbers run contiguously from 1 to total.
            pageNo := i*page + j
            link := "https://tj.esf.fang.com/housing/__0_0_0_0_" + strconv.Itoa(pageNo) + "_0_0_0/"
            go work(link, pageNo)
        }
    }

    // The package-level counter starts at 1, which would end the wait one
    // signal early and leave the last worker blocked on its send. Restart it
    // from zero so exactly `total` signals are consumed.
    for count = 0; count < total; {
        <-flagCh
        count++
        if count < total {
            fmt.Println("<- receive the " + strconv.Itoa(count) + " thread ending flag")
        }
    }

    for _, info := range infos {
        insertDB(DB, info)
    }
    fmt.Println("insert DB == ", len(infos))
    fmt.Println("All " + strconv.Itoa(count) + " has done")
}

func work(url string, page int) {
    c := colly.NewCollector()
    // 小区列表页
    c.OnHTML("body", func(e *colly.HTMLElement) {

        e.DOM.Find(".plotListwrap").Each(func(i int, selection *goquery.Selection) {
            info        := XQinfo{}
            link, _     := selection.Find(".plotListwrap > dt > a").Attr("href")
            // 行政区名称
            //district    := selection.Find(".plotListwrap > dd > p:nth-child(2) > a:nth-child(1)").Text();
            //info.district = district

            detailVisit(info, link)
        })
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())
    })

    c.OnScraped(func(response *colly.Response) {

        fmt.Println("the " + strconv.Itoa(page) + " thread sending end flag ->")

        flagCh <- 1
    })

    c.Visit(url)
}

// infosMu serialises appends to the shared infos slice, which detailVisit
// performs from many goroutines at once.
var infosMu sync.Mutex

// detailVisit collects one community. It loads the community home page at
// link, records the banner image, follows the "details" link found there,
// fills info from the detail page and appends the result to infos.
func detailVisit(info XQinfo, link string) {
    detailLink := colly.NewCollector()
    detailController := detailLink.Clone()

    // Detail page handler, registered once up front rather than inside the
    // home-page callback.
    detailController.OnHTML("body", func(e *colly.HTMLElement) {
        // Community name.
        name := e.DOM.Find(".ceninfo_sq > h1 > a").Text()
        info.name = coverString(name)

        price := e.DOM.Find(".detaiLtop > dl:nth-child(1) > dd > span").Text()
        info.price = price + "元/㎡	"

        e.DOM.Find(".inforwrap").Each(func(i int, selection *goquery.Selection) {
            // Section title.
            modelName := coverString(selection.Prev().Find("h3").Text())

            switch modelName {
            case "基本信息", "配套设施":
                dealInfo(selection, &info)
            case "周边信息":
                selection.Find("dl dt").Each(func(_ int, otherSelect *goquery.Selection) {
                    tab := coverString(otherSelect.Text())
                    // Entries containing this marker are skipped.
                    if !strings.Contains(tab, "本段合作") {
                        info.other_info = info.other_info + tab + "|"
                    }
                })
            }
        })

        // infos is shared by every worker goroutine; an unguarded append is a
        // data race and can lose records.
        infosMu.Lock()
        infos = append(infos, info)
        infosMu.Unlock()
    })

    // Community home page handler.
    detailLink.OnHTML("body", func(e *colly.HTMLElement) {
        // Link to the detail page.
        link1, ok := e.DOM.Find("#kesfxqxq_A01_03_01 > a").Attr("href")
        if !ok || link1 == "" {
            fmt.Println("no detail link found on", link)
            return
        }
        // Image address; a missing image is tolerated and leaves imgs empty.
        img, _ := e.DOM.Find(".bannerbg_pos > a > img").Attr("src")
        info.imgs = img

        // Visit the community detail page.
        if err := detailController.Visit(link1); err != nil {
            fmt.Println("visit", link1, "fail:", err)
        }
    })

    if err := detailLink.Visit(link); err != nil {
        fmt.Println("visit", link, "fail:", err)
    }
}

// dealInfo processes a community information section: every "dl dd" node and
// then every "dl dt" node under selection is handed to setXQinfo, which
// stores the recognised values in info.
func dealInfo(selection *goquery.Selection, info *XQinfo) {
    for _, rowSelector := range []string{"dl dd", "dl dt"} {
        selection.Find(rowSelector).Each(func(_ int, row *goquery.Selection) {
            setXQinfo(row, info)
        })
    }
}

// labelSeparators lists the colon forms accepted between a row's label and
// its value: the ASCII colon and the full-width colon.
var labelSeparators = []string{":", "："}

// splitAtLabelSeparator splits s around the earliest label separator. found
// is false, and before is s unchanged, when s contains no separator.
func splitAtLabelSeparator(s string) (before, after string, found bool) {
    pos, width := -1, 0
    for _, sep := range labelSeparators {
        if i := strings.Index(s, sep); i >= 0 && (pos < 0 || i < pos) {
            pos, width = i, len(sep)
        }
    }
    if pos < 0 {
        return s, "", false
    }
    return s[:pos], s[pos+width:], true
}

// leadingYear parses the first four characters of value as a year and returns
// 0 when value is shorter than that or does not start with a number.
func leadingYear(value string) int {
    chars := []rune(value)
    if len(chars) < 4 {
        return 0
    }
    year, err := strconv.Atoi(string(chars[:4]))
    if err != nil {
        return 0
    }
    return year
}

// setXQinfo reads one labelled row and stores its value in the matching field
// of info. The label is the text of the row's <strong> element up to the
// separator; the value is the row's title attribute when present, otherwise
// the row text after the separator. Rows with an unrecognised label are
// ignored.
func setXQinfo(selectionbase *goquery.Selection, info *XQinfo) {
    orgKey := coverString(selectionbase.Find("strong").Text())

    key := orgKey
    if label, _, ok := splitAtLabelSeparator(orgKey); ok && label != "" {
        key = label
    }

    var value string
    if title, ok := selectionbase.Attr("title"); ok {
        value = coverString(title)
    } else {
        fullValue := coverString(selectionbase.Text())
        // Skip exactly the width of the separator that was found instead of a
        // fixed three bytes, which cut into the value after an ASCII colon and
        // sliced at an arbitrary offset (or panicked) when no colon existed.
        if _, rest, found := splitAtLabelSeparator(fullValue); found {
            value = rest
        } else {
            // No separator: drop the label text and keep what remains.
            value = strings.TrimPrefix(fullValue, orgKey)
        }
    }

    switch key {
    case "小区地址":
        info.addr = value
    case "所属区域":
        info.area = value
        // The district is the part before the first space; without a space
        // the whole value is used rather than slicing with index -1.
        if sp := strings.Index(value, " "); sp >= 0 {
            info.district = value[:sp]
        } else {
            info.district = value
        }
    case "邮编":
        info.post_code = value
    case "产权描述":
        info.property_right = value
    case "物业类别":
        info.property_type = value
    case "建筑年代":
        info.build_time = leadingYear(value)
    case "开 发 商":
        info.developer = value
    case "建筑结构":
        info.build_struct = value
    case "建筑类型":
        info.build_type = value
    case "建筑面积":
        info.build_area = value
    case "占地面积":
        info.floor_space = value
    case "物业公司":
        info.manager_company = value
    case "房屋总数":
        info.house_count = value
    case "绿 化 率":
        info.green_ratio = value
    case "容 积 率":
        info.plot_ratio = value
    case "物 业 费":
        info.proprety_fee = value
    case "附加信息":
        info.additional_info = value
    case "供水":
        info.water_supply = value
    case "供暖":
        info.heat_supply = value
    case "供电":
        info.elec_supply = value
    case "燃气":
        info.gas = value
    case "安全管理":
        info.security = value
    case "卫生服务":
        info.environment = value
    case "停 车 位":
        info.parking_space = value
    }
}

// coverGBKToUTF8 decodes src, the string to convert, with mahonia's "gbk"
// decoder and returns the result.
func coverGBKToUTF8(src string) string {
    gbkDecoder := mahonia.NewDecoder("gbk")
    return gbkDecoder.ConvertString(src)
}

// replaceNullHtml removes every occurrence of "聽" and then every occurrence
// of "小区网" from src. The removals run in that order, so a "小区网" that only
// forms after the first removal is removed as well.
func replaceNullHtml(src string) string {
    cleaned := src
    for _, junk := range []string{"聽", "小区网"} {
        cleaned = strings.Replace(cleaned, junk, "", -1)
    }
    return cleaned
}

func coverString(src string) string {
    return replaceNullHtml(coverGBKToUTF8(src))
}


// insertDB stores info as one row of rx_xiaoqu_more. Every struct field
// becomes a column named after the field, in declaration order. On failure
// the statement and the error are printed and nothing else happens; on
// success the result of LastInsertId is printed.
//
// The values are scraped from web pages, so they are bound as parameters
// instead of being quoted into the SQL text: a quote inside a value can
// neither break nor alter the statement.
func insertDB(DB *sql.DB, info XQinfo) {
    t := reflect.TypeOf(info)
    v := reflect.ValueOf(info)

    n := t.NumField()
    columns := make([]string, 0, n)
    holders := make([]string, 0, n)
    args := make([]interface{}, 0, n)

    for i := 0; i < n; i++ {
        columns = append(columns, t.Field(i).Name)
        holders = append(holders, "?")

        // Int and String are used because Interface is not permitted on
        // unexported fields.
        field := v.Field(i)
        switch field.Kind() {
        case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
            args = append(args, field.Int())
        default:
            args = append(args, field.String())
        }
    }

    query := "insert into rx_xiaoqu_more(" + strings.Join(columns, ", ") +
        ") values (" + strings.Join(holders, ", ") + ")"

    res, err := DB.Exec(query, args...)
    if err != nil {
        fmt.Println(query)
        fmt.Println(err)
        // res is nil on failure; do not touch it.
        return
    }
    fmt.Println(res.LastInsertId())
}

你可能感兴趣的:(golang)