爬虫案例(golang)

package main

import (
"crypto/md5"
"database/sql"
"encoding/hex"
"fmt"
"strconv"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
_ "github.com/go-sql-driver/mysql"

)

// main is the program entry point; it performs a single Chaindd crawl.
func main() {
Chaindd()
}

// Chaindd scrapes the article list on the chaindd.com front page, fetches each
// linked article page, and writes one row per article into the MySQL table
// newsc via a REPLACE statement.
//
// The database handle and the prepared statement are created once and released
// when the function returns. Failure to load the front page, open the database
// or execute the statement aborts the crawl through checkErr (panic). Problems
// limited to a single article (missing link, article page not loadable,
// unparsable publication time) are reported on stdout and that article is
// skipped, so the remaining articles are still processed.
func Chaindd() {
	const (
		baseURL    = "http://www.chaindd.com"
		source     = "链得得"              // every row is attributed to this fixed source name
		timeLayout = "2006-01-02 15:04:05" // layout of the publication time once ":00" is appended
	)

	doc, err := goquery.NewDocument(baseURL + "/")
	checkErr(err)

	// sql.Open only validates its arguments; the handle owns a connection
	// pool, so it is opened once here rather than once per article.
	db, err := sql.Open("mysql", "root:123456@/news?charset=utf8")
	checkErr(err)
	defer db.Close()

	stmt, err := db.Prepare("REPLACE newsc SET onlyid=?,title=?,imgUrl=?,content=?,main=?,addr=?,href=?,time=?")
	checkErr(err)
	defer stmt.Close()

	// Single-pass cleaners for the stored text fields.
	titleCleaner := strings.NewReplacer(",", "", " ", "")                  // drop commas and spaces
	summaryCleaner := strings.NewReplacer("\t", " ", "\n", " ", `"`, "'")  // flatten whitespace, " -> '
	contentCleaner := strings.NewReplacer("\t", " ", "\n", "/n", `"`, "'") // newlines become the literal "/n"

	doc.Find(".article-list li").Not("li:last-child").Each(func(_ int, s *goquery.Selection) {
		// Thumbnail URL; a missing attribute is stored as the empty string.
		imgURL, _ := s.Find(".pic img").Attr("src")

		// Article title as shown in the list.
		title := s.Find(".cont h3 .title").Text()

		// Unique ID: first 10 hex characters of md5(title) followed by the
		// decimal Unix time with its first six digits removed.
		// NOTE(review): because the current time is part of the ID, a later
		// run produces a different ID for the same title — confirm that this
		// is intended given the REPLACE statement.
		digest := md5.Sum([]byte(title))
		stamp := strconv.FormatInt(time.Now().Unix(), 10)
		onlyID := hex.EncodeToString(digest[:])[:10] + stamp[6:]

		// Summary shown under the title.
		summary := summaryCleaner.Replace(s.Find(".cont .summary").Text())

		// Link to the full article; without it there is nothing to fetch.
		href, ok := s.Find(".cont h3 a").Attr("href")
		if !ok {
			fmt.Printf("chaindd: skipping %q: no article link\n", title)
			return
		}
		articleURL := baseURL + href

		page, err := goquery.NewDocument(articleURL)
		if err != nil {
			fmt.Printf("chaindd: skipping %s: %v\n", articleURL, err)
			return
		}

		// Publication time: ":00" is appended to the page text to supply the
		// seconds field, then the value is parsed in the local time zone.
		rawTime := strings.TrimSpace(page.Find("article .authors .time").Text()) + ":00"
		published, err := time.ParseInLocation(timeLayout, rawTime, time.Local)
		if err != nil {
			fmt.Printf("chaindd: skipping %s: bad publication time %q: %v\n", articleURL, rawTime, err)
			return
		}
		timestamp := strconv.FormatInt(published.Unix(), 10)

		// Full article text.
		content := contentCleaner.Replace(page.Find(".inner").Text())

		// Argument order follows the column list of the statement:
		// onlyid, title, imgUrl, content, main (summary), addr, href, time.
		res, err := stmt.Exec(onlyID, titleCleaner.Replace(title), imgURL, content, summary, source, articleURL, timestamp)
		checkErr(err)
		id, err := res.LastInsertId()
		checkErr(err)
		fmt.Println(id)
	})
}

// checkErr aborts the program by panicking with err when err is non-nil.
// A nil err is a no-op.
func checkErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}

你可能感兴趣的:(爬虫案例(golang))