golang:封装 google search engine

简介

破解 google 搜索接口,进行代码自动化接入 google engine,而不是通过 web driver方式,不占服务器太多带宽
利用 proto 文件作为配置结构,请自行利用 protoc 编译成对应的 pb.go 文件;配置文件采用 yaml,请自行实现配置的读取与关联

目录结构

image.png

代码

  • conf.proto
syntax = "proto3";
package kxconfig;

option go_package = "ggsearchengine";
import "google/protobuf/struct.proto";
import "google/protobuf/duration.proto";

// Bootstrap is the top-level configuration message; the YAML sample's
// top-level "client" key maps onto its only field.
message Bootstrap {
  // Settings for outbound clients. Note that the field number is 5, not 1.
  Client client = 5;
}

// Client groups the settings for outbound clients.
message Client {
  // HTTP holds the HTTP client and proxy settings.
  // NOTE(review): of these fields, the Go code published with this schema only
  // reads proxy, premium_proxy, proxy_timeout and premium_proxy_timeout; the
  // remaining fields appear to be unused there - confirm before relying on them.
  message HTTP {
    // General request timeout.
    google.protobuf.Duration timeout = 1;
    // Address of the default ("free") proxy; left unset when absent.
    optional string proxy = 2;
    // Connection-pool limits.
    int32 max_idle_conns = 3;
    int32 max_conns_per_host = 4;
    int32 max_idle_conns_per_host = 5;
    // Overall timeout of the HTTP client that goes through proxy.
    google.protobuf.Duration proxy_timeout = 6;
    // Address of the premium proxy; left unset when absent.
    optional string premium_proxy = 7;
    // Overall timeout of the HTTP client that goes through premium_proxy.
    google.protobuf.Duration premium_proxy_timeout = 8;
  }
  HTTP http = 1;
}
  • config.yaml
client:
  http:
    timeout: 10s
    max_idle_conns: 10
    max_conns_per_host: 100
    max_idle_conns_per_host: 10
    proxy: http://127.0.0.1:8118
    premium_proxy: http://127.0.0.1:8228
    proxy_timeout: 10s
    premium_proxy_timeout: 10s
  • google_search_engine.go
package ggsearchengine

import (
    "bytes"
    "context"
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
    "strings"
    "time"

    ggenerator "ggsearch_engine/ggsearchengine/useragent"
    "github.com/PuerkitoBio/goquery"
    "github.com/go-kratos/kratos/v2/errors"
    "github.com/go-kratos/kratos/v2/log"
)

var (
    // baseUrl is the Google search endpoint every request is sent to.
    baseUrl = "https://www.google.com/search"
    // Err is the error returned when Google answers with a non-200 HTTP status.
    Err     = errors.New(400, "GOOGLE_HTTP_BAD_CODE", "google search http bad code")
)

// googleSearchEngine is the Google-backed SearchEngine implementation.
// Its fields must not hold anything that changes from one search to the next:
// a single engine instance is meant to be shared globally.
type googleSearchEngine struct {
    log          *log.Helper
    nProxyClient *NoPoolProxyHttpClient
}

// NewGoogleSearchEngine builds a SearchEngine that queries Google through the
// HTTP clients held by nProxyClient and logs through logger.
func NewGoogleSearchEngine(logger log.Logger, nProxyClient *NoPoolProxyHttpClient) SearchEngine {
    engine := new(googleSearchEngine)
    engine.log = log.NewHelper(logger)
    engine.nProxyClient = nProxyClient
    return engine
}

// WithMonitor returns an Option that sets the metric key under which the
// search request is reported to setMonitor.
func WithMonitor(metricKey string) Option {
    key := &metricKey
    return func(cfg *options) {
        cfg.metricKey = key
    }
}

// WithProxy returns an Option that selects which proxy client a search uses:
// true picks the free proxy client, false the premium one.
func WithProxy(isFreeProxy bool) Option {
    apply := func(cfg *options) {
        cfg.isFreeProxy = isFreeProxy
    }
    return apply
}

// WithTextExtract returns an Option that post-processes each result's text:
// textExtractionSyntax is a regular expression applied to the text and
// textExtractionIndex lists the capture-group indexes to keep, in order.
func WithTextExtract(textExtractionSyntax string, textExtractionIndex []int) Option {
    syntax := &textExtractionSyntax
    return func(cfg *options) {
        cfg.textExtractionIndex = textExtractionIndex
        cfg.textExtractionSyntax = syntax
    }
}

// textExtract applies the regular expression extractSyntax to text and returns
// the capture groups selected by extractIndexes, in the order the indexes were
// given (index 0 is the whole match).
//
// The result always has len(extractIndexes) entries. An entry stays empty when
// the pattern does not match, when its index is negative, or when the pattern
// has no group with that index. Because extractSyntax comes from the caller, an
// invalid pattern is logged and yields all-empty entries instead of panicking.
func (g *googleSearchEngine) textExtract(ctx context.Context, text string, extractSyntax string, extractIndexes []int) []string {
    result := make([]string, len(extractIndexes))
    regx, err := regexp.Compile(extractSyntax)
    if err != nil {
        g.log.WithContext(ctx).Errorf("compile text extraction syntax %q: %v", extractSyntax, err)
        return result
    }
    extractContents := regx.FindStringSubmatch(text)
    // Fill the result in the order the indexes were requested.
    for i, extractIndex := range extractIndexes {
        if extractIndex < 0 || extractIndex >= len(extractContents) {
            continue
        }
        result[i] = extractContents[extractIndex]
    }
    return result
}

// googleResultLinkRe pulls the target URL out of a Google redirect href of the
// form "/url?q=<target>&sa=...". The pattern is ungreedy ((?U)), so the capture
// stops at the first "&sa=". The capture group must be written as (?P<name>...):
// a bare "(?P" is not valid Go regexp syntax and fails to compile.
// It is compiled once here instead of once per search result.
var googleResultLinkRe = regexp.MustCompile(`(?U)^.*/url\?q=.*(?P<link>.*)&sa=.*`)

// parseGoogleResponse parses the HTML page r returned by Google and converts
// every ".xpd" element that contains an "a>h3" heading into a SearchResult.
//
// For each result:
//   - Title is the text of the first "a>h3" element;
//   - Content is the text of all descendant "div" elements joined with ",";
//     when o.textExtractionSyntax is set, that text is replaced by the capture
//     groups selected through textExtract;
//   - Url is the target captured from the first anchor's href, or "" when the
//     href does not have the "/url?q=...&sa=" shape. The value is returned as
//     it appears in the href, without unescaping.
//
// It returns an error only when the HTML cannot be parsed; a page without
// results yields an empty, non-nil slice.
func (g *googleSearchEngine) parseGoogleResponse(ctx context.Context, o *options, r []byte) (results []*SearchResult, err error) {
    doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r))
    if err != nil {
        return nil, err
    }
    results = make([]*SearchResult, 0)
    doc.Find(".xpd").Has("a>h3").Each(func(_ int, s *goquery.Selection) {
        result := &SearchResult{
            Title: s.Find("a>h3").First().Text(),
        }

        // Collect the text of every descendant div as the raw content.
        contents := make([]string, 0)
        s.Find("div").Each(func(_ int, selection *goquery.Selection) {
            contents = append(contents, selection.Text())
        })
        content := strings.Join(contents, ",")
        if o.textExtractionSyntax != nil {
            // One entry per requested capture group.
            result.Content = g.textExtract(ctx, content, *o.textExtractionSyntax, o.textExtractionIndex)
        } else {
            result.Content = []string{content}
        }

        // Extract the real link from Google's redirect href.
        originLink := s.Find("a").First().AttrOr("href", "")
        if extractLink := googleResultLinkRe.FindStringSubmatch(originLink); len(extractLink) >= 2 {
            result.Url = extractLink[1]
        }
        results = append(results, result)
    })

    return results, nil
}

// sendGoogleRequest performs one GET against baseUrl and returns the raw
// response body.
//
// pageNum is the 1-based page and pageSize the number of results per page;
// query is the search text and uule an optional location parameter. The HTTP
// client is chosen from o.isFreeProxy. Every outcome, including transport and
// read failures, is reported through setMonitor. A non-200 status is logged
// together with the response body and reported as Err.
func (g *googleSearchEngine) sendGoogleRequest(ctx context.Context, o *options, pageNum, pageSize uint32, query string, uule string) (ret []byte, err error) {
    // Google's "start" parameter is the zero-based offset of the first result,
    // not a page index, so page N begins at (N-1)*pageSize. pageNum 0 is
    // treated like page 1 to avoid unsigned underflow; the arithmetic is done
    // in uint64 so large page numbers cannot wrap.
    var start uint64
    if pageNum > 1 {
        start = uint64(pageNum-1) * uint64(pageSize)
    }
    params := map[string]string{
        "newwindow": "1",
        "gbv":       "1",
        "num":       fmt.Sprintf("%d", pageSize),
        "start":     fmt.Sprintf("%d", start),
        "ie":        "UTF-8",
        "oe":        "UTF-8",
        "hl":        "en-US",
        "q":         query,
    }
    // An empty uule means a worldwide (non-localised) search.
    if uule != "" {
        params["uule"] = uule
    }
    req, err := newHttpGetRequest(baseUrl, params)
    if err != nil {
        return nil, err
    }
    // Bind the request to ctx so cancellation and deadlines abort the call.
    if ctx != nil {
        req = req.WithContext(ctx)
    }
    headers := []string{ggenerator.USER_AGENT}
    headerMap := ggenerator.HeaderGenerator(headers)
    for headerKey, headerValue := range headerMap {
        req.Header.Add(headerKey, headerValue)
    }
    g.log.WithContext(ctx).Infof("get google search:%s, header: %v, request parameter:%v",
        baseUrl, headerMap, params)
    begin := time.Now()
    // Report to monitoring on every exit path; err is the named result, so the
    // deferred call observes the final error value.
    defer func() {
        setMonitor(o.metricKey, begin, err)
    }()
    resp, err := getHttpClient(g.nProxyClient, o.isFreeProxy).Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    respBody, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }
    if resp.StatusCode != http.StatusOK {
        g.log.WithContext(ctx).Infof("get google search:%s, response:%s",
            resp.Request.URL, string(respBody))
        return nil, Err
    }

    return respBody, nil
}

// Search runs one Google query and returns the parsed results.
//
// pageNum and pageSize select the slice of results, query is the search text
// and uule an optional location parameter (empty for a worldwide search).
// The free proxy client is used unless an Option says otherwise. Any error
// from sending the request or parsing the response is returned as is, with a
// nil result slice.
func (g *googleSearchEngine) Search(ctx context.Context, pageNum, pageSize uint32, query, uule string, opts ...Option) (results []*SearchResult, err error) {
    // Start from the defaults, then let each option adjust them.
    cfg := new(options)
    cfg.isFreeProxy = true
    for i := range opts {
        opts[i](cfg)
    }

    raw, err := g.sendGoogleRequest(ctx, cfg, pageNum, pageSize, query, uule)
    if err != nil {
        return nil, err
    }

    parsed, err := g.parseGoogleResponse(ctx, cfg, raw)
    if err != nil {
        return nil, err
    }
    return parsed, nil
}

  • iface.go
package ggsearchengine

import "context"

// SearchEngine is the abstraction over a web search backend.
type SearchEngine interface {
    // Search runs query and returns the results for the requested page.
    // pageNum and pageSize control paging, uule is a location parameter and
    // options tune the individual call.
    Search(ctx context.Context, pageNum, pageSize uint32, query, uule string, options ...Option) (results []*SearchResult, err error)
}
  • native_client.go
package ggsearchengine

import (
    "net/http"
    "net/url"

    "github.com/go-kratos/kratos/v2/log"
)

// NoPoolProxyHttpClient bundles the two HTTP clients used for searching.
type NoPoolProxyHttpClient struct {
    // Client goes through the default ("free") proxy.
    Client        *http.Client
    // PremiumClient goes through the premium proxy.
    PremiumClient *http.Client
}

// NewNoPoolProxyHttpClient creates the free and premium HTTP clients from the
// HTTP section of bc.
//
// Each client gets its own clone of http.DefaultTransport with keep-alives
// disabled, the configured proxy (when one is set) and the matching proxy
// timeout as overall client timeout. logger is currently not used.
func NewNoPoolProxyHttpClient(bc *Bootstrap, logger log.Logger) *NoPoolProxyHttpClient {
    conf := bc.Client.Http

    // newTransport clones the default transport and applies the shared settings.
    newTransport := func(proxy *string) *http.Transport {
        tr := http.DefaultTransport.(*http.Transport).Clone()
        if proxy != nil {
            tr.Proxy = setProxy(*proxy)
        }
        // HTTP keep-alive must be switched off for these clients.
        tr.DisableKeepAlives = true
        return tr
    }

    free := &http.Client{
        Transport: newTransport(conf.Proxy),
        Timeout:   conf.ProxyTimeout.AsDuration(),
    }
    premium := &http.Client{
        Transport: newTransport(conf.PremiumProxy),
        Timeout:   conf.PremiumProxyTimeout.AsDuration(),
    }
    return &NoPoolProxyHttpClient{Client: free, PremiumClient: premium}
}

// setProxy returns a function suitable for http.Transport.Proxy that always
// answers with the proxy address p.
//
// p is parsed once, up front, instead of on every request. A bare "host:port"
// value either fails to parse or parses into a URL without a host, so in both
// cases p is retried with an "http://" prefix. If no usable URL can be
// produced, the parse error is returned from the proxy function on each
// request.
func setProxy(p string) func(*http.Request) (*url.URL, error) {
    proxyURL, err := url.Parse(p)
    if err != nil || proxyURL.Host == "" {
        // Scheme-less address: assume a plain HTTP proxy.
        if withScheme, retryErr := url.Parse("http://" + p); retryErr == nil {
            proxyURL, err = withScheme, nil
        }
    }
    return func(*http.Request) (*url.URL, error) {
        return proxyURL, err
    }
}

  • types.go
package ggsearchengine

import (
    "net/http"
    "net/url"
    "strings"
    "time"
)

// SearchResult is one entry of a search result page.
type SearchResult struct {
    // Title is the heading text of the result.
    Title   string
    // Content holds the result text: either the whole text as a single
    // element, or one element per extracted capture group when text
    // extraction is enabled.
    Content []string
    // Url is the link the result points to.
    Url     string
}

// Option adjusts the settings of a single Search call.
type Option func(*options)

// options collects the per-call settings that Option values modify.
type options struct {
    // metricKey is the key passed to setMonitor; nil disables monitoring.
    metricKey            *string
    // isFreeProxy selects the free proxy client (true) or the premium one (false).
    isFreeProxy          bool
    // textExtractionSyntax is an optional regular expression applied to the
    // result text; nil leaves the text untouched.
    textExtractionSyntax *string
    // textExtractionIndex lists the capture-group indexes to return when
    // textExtractionSyntax is set.
    textExtractionIndex  []int
}

// newHttpGetRequest builds a GET request for baseUrl with params appended as
// the query string.
//
// Keys and values are both query-escaped and the parameters are emitted sorted
// by key, so the same input always produces the same URL (ranging over the map
// directly would give a random order). When baseUrl already carries a query
// the parameters are appended with "&"; when params is empty baseUrl is used
// unchanged. The error, if any, comes from http.NewRequest.
func newHttpGetRequest(baseUrl string, params map[string]string) (*http.Request, error) {
    query := make(url.Values, len(params))
    for k, v := range params {
        query.Set(k, v)
    }

    var urlBuilder strings.Builder
    urlBuilder.WriteString(baseUrl)
    if encoded := query.Encode(); encoded != "" {
        if strings.Contains(baseUrl, "?") {
            urlBuilder.WriteString("&")
        } else {
            urlBuilder.WriteString("?")
        }
        urlBuilder.WriteString(encoded)
    }
    return http.NewRequest("GET", urlBuilder.String(), nil)
}

// setMonitor is the hook for reporting metrics about a search request.
// The body is a placeholder: the original reporting code is company-sensitive,
// so implement it yourself with a Prometheus exporter and metrics.
// Nothing is reported when metricKey is nil.
func setMonitor(metricKey *string, begin time.Time, err error) {
    if metricKey != nil {
        // Report the elapsed time.
        if err != nil {
            // Report an error count.
        } else {
            // Report a success count.
        }
    }
}

// getHttpClient picks the HTTP client for a request: the free-proxy client
// when isFreeProxy is true, the premium-proxy client otherwise.
func getHttpClient(nProxyClient *NoPoolProxyHttpClient, isFreeProxy bool) *http.Client {
    if !isFreeProxy {
        return nProxyClient.PremiumClient
    }
    return nProxyClient.Client
}

  • useragent/data.go
package ggenerator

const (
    // USER_AGENT is the name of the HTTP User-Agent header; HeaderGenerator
    // generates a value for it when it is requested.
    USER_AGENT = "User-Agent"
)

// userAgentList is the pool of browser User-Agent strings from which one is
// picked at random for each generated User-Agent header. It mixes desktop and
// mobile browsers; entries are not necessarily unique.
var userAgentList = []string{
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 OPR/85.0.4341.39",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 9; itel W6004 Build/PPR1.180610.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/79.0.3945.116 Mobile Safari/537.36 OPR/55.0.2254.56695",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS x86_64 14268.67.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.111 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 6.0.1; SM-N910C Build/MMB29K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 OPR/62.1.2254.60552",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.30",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 10; TECNO KD6a Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.99 Mobile Safari/537.36 OPR/62.3.2254.60988",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36",
}

  • useragent/useragent_generator.go
package ggenerator

import (
    "math/rand"
    "time"
)

// init seeds the shared math/rand source once at start-up. Reseeding on every
// call is unnecessary and makes calls that land on the same clock reading
// return the same number.
func init() {
    rand.Seed(time.Now().UnixNano())
}

// RandomRange returns a pseudo-random integer in the inclusive range
// [min, max]. When the bounds are passed in the wrong order they are swapped,
// so the call never panics on an inverted range.
func RandomRange(min, max int) int {
    if max < min {
        min, max = max, min
    }
    return rand.Intn(max-min+1) + min
}

// randomSelectElements returns one element of selectableSet chosen at random.
// It returns "" for an empty set instead of panicking.
func randomSelectElements(selectableSet []string) string {
    if len(selectableSet) == 0 {
        return ""
    }
    return selectableSet[RandomRange(0, len(selectableSet)-1)]
}

// userAgentHandler picks a random User-Agent string from userAgentList.
func userAgentHandler() string {
    return randomSelectElements(userAgentList)
}

// HeaderGenerator builds header values for the requested header names.
// Only USER_AGENT is supported, and it receives a randomly chosen User-Agent
// string; every other name is ignored. The returned map is never nil.
func HeaderGenerator(headers []string) map[string]string {
    generated := map[string]string{}

    for i := range headers {
        if headers[i] != USER_AGENT {
            // Unknown header names are skipped.
            continue
        }
        generated[USER_AGENT] = userAgentHandler()
    }

    return generated
}

关于 uule

https://site-analyzer.pro/services-seo/uule/

  • uule.proto
syntax = "proto3";
package kratos.api;

option go_package = "uule";

// Uule describes the data behind a "uule" location parameter.
// NOTE(review): the field meanings below are inferred from their names only -
// confirm them against a description of the uule encoding before relying on them.
message Uule {
  // presumably the role value of the encoded location - TODO confirm
  int32 role = 1;
  // presumably the producer value of the encoded location - TODO confirm
  int32 producer = 2;
  // presumably the canonical name of the location; field number 3 is not used
  string canonical_name =4;
}

你可能感兴趣的:(golang:封装 google search engine)