用 golang 可以很方便地爬图(通过 http 下载图片,并存储为 jpg 格式)。
http client 有如下最佳实践:
正向代理与反向代理的区别
package main
import (
"log"
"net/http"
"net/http/httputil"
"net/url"
)
// NewProxy builds a reverse proxy aimed at the host named by remoteURL.
// It returns an error when remoteURL cannot be parsed.
//
// The proxy's Director is replaced with one that points the outgoing
// request's Host, URL scheme and URL host at the remote and always sets the
// URL path to "/metrics", whatever path the incoming request or remoteURL
// carried. Headers and the query string are not modified by the Director.
func NewProxy(remoteURL string) (*httputil.ReverseProxy, error) {
	target, parseErr := url.Parse(remoteURL)
	if parseErr != nil {
		return nil, parseErr
	}

	// retarget rewrites the outgoing request so it reaches the remote backend.
	retarget := func(out *http.Request) {
		out.URL.Path = "/metrics"
		out.URL.Host = target.Host
		out.URL.Scheme = target.Scheme
		out.Host = target.Host
	}

	rp := httputil.NewSingleHostReverseProxy(target)
	rp.Director = retarget
	return rp, nil
}
// ProxyRequestHandler adapts proxy to a plain handler function: the returned
// function passes each request and response writer straight to
// proxy.ServeHTTP.
func ProxyRequestHandler(proxy *httputil.ReverseProxy) func(http.ResponseWriter, *http.Request) {
	forward := func(w http.ResponseWriter, req *http.Request) {
		proxy.ServeHTTP(w, req)
	}
	return forward
}
// main creates a reverse proxy for the metrics backend, registers it on the
// default mux under /api/proxy/metrics and serves on :8080. A failure to
// build the proxy panics; the error returned by the server is passed to
// log.Fatal.
func main() {
	const (
		backend = "http://127.0.0.1:7000/metrics" // address of the real backend service
		route   = "/api/proxy/metrics"
		addr    = ":8080"
	)

	rp, err := NewProxy(backend)
	if err != nil {
		panic(err)
	}

	handler := ProxyRequestHandler(rp)
	http.HandleFunc(route, handler)

	serveErr := http.ListenAndServe(addr, nil)
	log.Fatal(serveErr)
}
package main
import (
"github.com/gin-gonic/gin"
"net/http"
"net/http/httputil"
"net/url"
)
// NewProxy parses remoteURL and returns a reverse proxy that sends requests
// to that host. A parse failure is returned as the error, with a nil proxy.
//
// The Director installed here overrides the default one. It sets the
// outgoing request's Host, URL scheme and URL host from the parsed remote and
// forces the URL path to "/metrics"; it does not touch headers or the query
// string.
func NewProxy(remoteURL string) (*httputil.ReverseProxy, error) {
	backend, parseErr := url.Parse(remoteURL)
	if parseErr != nil {
		return nil, parseErr
	}

	reverseProxy := httputil.NewSingleHostReverseProxy(backend)

	// director redirects every proxied request to the backend's metrics path.
	director := func(outgoing *http.Request) {
		host, scheme := backend.Host, backend.Scheme
		outgoing.Host = host
		outgoing.URL.Host = host
		outgoing.URL.Scheme = scheme
		outgoing.URL.Path = "/metrics"
	}
	reverseProxy.Director = director

	return reverseProxy, nil
}
// proxy is the gin handler that relays the current request to the metrics
// backend. It builds a new reverse proxy on every call and panics if that
// construction fails.
func proxy(c *gin.Context) {
	// Address of the real backend service the request is forwarded to.
	const backend = "http://127.0.0.1:7000/metrics"

	rp, err := NewProxy(backend)
	if err != nil {
		panic(err)
	}

	w, req := c.Writer, c.Request
	rp.ServeHTTP(w, req)
}
// main starts a gin engine on :8080 that serves the metrics proxy at
// GET /api/proxy/metrics.
func main() {
	r := gin.Default()
	r.GET("/api/proxy/metrics", proxy)

	// Run returns an error when the server cannot start (for example, the
	// port is already in use). Surface it instead of returning silently with
	// exit status 0; panic matches how this program reports its other
	// start-up failures.
	if err := r.Run(":8080"); err != nil {
		panic(err)
	}
}
package clients
import (
"fmt"
"github.com/sirupsen/logrus"
"io"
"net/http"
"os"
"strings"
"time"
)
// httpCli is the package-level HTTP client handed to every spiderCli created
// by Spider. Its Timeout is set to 10 seconds.
var httpCli = &http.Client{Timeout: 10 * time.Second}
// spiderCli groups the resources used to download images and record them:
// an HTTP client and an open text file.
type spiderCli struct {
// httpCli is the client Download uses to issue GET requests.
httpCli *http.Client
// txtFile is the file Save appends one comma-separated line to per image;
// Close closes it.
txtFile *os.File
}
// Spider returns a spiderCli that uses the package-level httpCli and appends
// to the text file named by configs.TxtFileName(). The file is opened
// write-only in append mode and created with mode 0644 when missing. If the
// file cannot be opened, the error is passed to logrus.Fatal.
func Spider() *spiderCli {
	const openFlags = os.O_APPEND | os.O_CREATE | os.O_WRONLY

	f, err := os.OpenFile(configs.TxtFileName(), openFlags, 0644)
	if err != nil {
		logrus.Fatal(err)
	}

	cli := new(spiderCli)
	cli.httpCli = httpCli
	cli.txtFile = f
	return cli
}
// Download fetches url with an HTTP GET through the client's httpCli and
// returns the complete response body.
//
// It returns a nil slice and an error when the request itself fails or when
// the server answers with any status outside the 2xx range, and passes
// through whatever io.ReadAll returns when reading the body.
func (c *spiderCli) Download(url string) ([]byte, error) {
	res, err := c.httpCli.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// The HTTP client does not treat non-2xx responses as errors. Rejecting
	// only 404 would let the body of a 403/500/... error page be returned as
	// if it were image data, so every non-2xx status is refused here.
	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("got invalid status code: %v", res.StatusCode)
	}

	return io.ReadAll(res.Body)
}
// Save writes the image data to "<fid>.jpg" in the current working directory
// and then appends the line "<fid>.jpg,plateText,plateColor,plateType" to the
// client's text file.
//
// bytes is the raw image content; fid determines the image file name. An
// error is returned when bytes is empty, when the image cannot be created,
// written or closed, or when the text file cannot be appended to. The text
// line is only written after the image has been stored successfully, so the
// text file never gains an entry for an image this call failed to write.
func (c *spiderCli) Save(fid int64, bytes []byte, plateText, plateColor, plateType string) error {
	fImgName := fmt.Sprintf("%v.jpg", fid)

	// Nothing to store: refuse up front rather than creating and deleting an
	// empty file and recording a line for an image that does not exist.
	if len(bytes) == 0 {
		return fmt.Errorf("empty image data for %v", fImgName)
	}

	// img
	fImg, err := os.Create(fImgName)
	if err != nil {
		return err
	}
	_, err = fImg.Write(bytes)
	// Close before any cleanup, and keep its error: a failed close can mean
	// the data never reached the disk.
	if closeErr := fImg.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best effort: do not leave a truncated image behind. The write/close
		// error is the one worth reporting, so the removal result is ignored.
		_ = os.Remove(fImgName)
		return err
	}

	// txt
	line := strings.Join([]string{fImgName, plateText, plateColor, plateType}, ",")
	_, err = c.txtFile.WriteString(line + "\n")
	return err
}
// Close closes the client's text file. The method has no error return, so
// the error reported by the underlying close is discarded.
func (c *spiderCli) Close() {
	_ = c.txtFile.Close()
}