使用Go语言+goproxy库编写的http代理服务器+图片cache保存 第一版完成
用法:
1、安装Go 1.5.x
2、命令行设置环境变量GOPATH(Windows下设置到“我的电脑”里面)
3、安装goproxy库:go get github.com/elazarl/goproxy
4、运行脚本:go run go-httproxy-imagedumper.go
5、可以用go build创建exe版本
package main
import (
"crypto/sha1"
"errors"
"flag"
"fmt"
"io"
"log"
"net"
"net/http"
// "net/http/httputil"
"os"
"os/signal"
"path"
"regexp"
"sync"
// "time"
"github.com/elazarl/goproxy"
// "github.com/elazarl/goproxy/transport"
)
//--------------------------------------------------------------------------- Code Copied from examples/goproxy-httpdump
// FileStream is a write-closer backed by a file that is created lazily:
// the file at path is not created until the first call to Write.
type FileStream struct {
	path string   // destination file path
	f    *os.File // nil until the first successful Write has created the file
}

// NewFileStream returns a FileStream targeting path. No file is created yet.
func NewFileStream(path string) *FileStream {
	return &FileStream{path: path}
}

// Write appends b to the underlying file, creating it with os.Create on the
// first call. If creation fails, nothing is written and the error is returned.
func (fs *FileStream) Write(b []byte) (int, error) {
	if fs.f != nil {
		return fs.f.Write(b)
	}
	created, err := os.Create(fs.path)
	if err != nil {
		return 0, err
	}
	fs.f = created
	return created.Write(b)
}

// Close prints a trace line to stdout and closes the underlying file.
// It returns an error when no file was ever created, i.e. Write never
// succeeded in opening it.
func (fs *FileStream) Close() error {
	fmt.Println("Close", fs.path)
	if file := fs.f; file != nil {
		return file.Close()
	}
	return errors.New("FileStream was never written into")
}
//--------------------------------------------------------------------------- Code Copied from examples/goproxy-httpdump
// TeeReadCloser is an io.TeeReader whose source reader and destination
// writer can both be closed through a single Close call.
type TeeReadCloser struct {
	r io.Reader      // tee reader: every Read is mirrored into w
	w io.WriteCloser // destination that receives a copy of the data
	c io.Closer      // closer of the original source reader
}

// NewTeeReadCloser returns a ReadCloser that reads from r while copying
// everything that is read into w.
func NewTeeReadCloser(r io.ReadCloser, w io.WriteCloser) io.ReadCloser {
	tee := io.TeeReader(r, w)
	return &TeeReadCloser{r: tee, w: w, c: r}
}

// Read reads from the source and mirrors the bytes read into the writer.
func (t *TeeReadCloser) Read(b []byte) (int, error) {
	return t.r.Read(b)
}

// Close closes the source reader first and then the writer; both are always
// attempted. The reader's error takes precedence; the writer's error is
// returned only when closing the reader succeeded.
func (t *TeeReadCloser) Close() error {
	readerErr := t.c.Close()
	if writerErr := t.w.Close(); readerErr == nil {
		return writerErr
	}
	return readerErr
}
//--------------------------------------------------------------------------- Code Copied from examples/goproxy-httpdump
// stoppableListener wraps a net.Listener and counts, through the embedded
// WaitGroup, every connection it has accepted, so the application can Wait
// until all of them have been closed before terminating.
// The lower-case name keeps the type unexported.
type stoppableListener struct {
	net.Listener
	sync.WaitGroup
}

// stoppableConn is a net.Conn that signals the listener's WaitGroup when it
// is closed.
type stoppableConn struct {
	net.Conn
	wg   *sync.WaitGroup
	once sync.Once // ensures wg.Done runs only once, however often Close is called
}

// newStoppableListener wraps l with a fresh, zero-count WaitGroup.
func newStoppableListener(l net.Listener) *stoppableListener {
	return &stoppableListener{l, sync.WaitGroup{}}
}

// Accept waits for the next connection from the wrapped listener. On success
// the WaitGroup counter is incremented and the connection is returned wrapped
// in a stoppableConn; on failure the listener's result is returned unchanged.
func (sl *stoppableListener) Accept() (net.Conn, error) {
	c, err := sl.Listener.Accept()
	if err != nil {
		return c, err
	}
	sl.Add(1)
	return &stoppableConn{Conn: c, wg: &sl.WaitGroup}, nil
}

// Close releases this connection's WaitGroup slot and closes the underlying
// connection. It is safe to call more than once: without the once guard a
// second Close would call Done again and panic with a negative WaitGroup
// counter.
func (sc *stoppableConn) Close() error {
	sc.once.Do(sc.wg.Done)
	return sc.Conn.Close()
}
// imageSubTypeFinder captures the subtype of an image Content-Type value.
// Real values may carry parameters, e.g. "image/png;charset=UTF-8".
// Digits are part of the character class so that subtypes such as "jp2" are
// captured whole instead of being cut at the first digit.
var imageSubTypeFinder = regexp.MustCompile(`image/([a-z0-9\-]+)`)
// main starts an HTTP proxy that answers requests to a blocked host locally
// and saves every successfully fetched image (GET, status 200, Content-Type
// image/*) into the cache directory. On SIGINT the listener is closed and the
// program exits once all accepted connections have been closed.
//
// Flags:
//
//	-v  log every proxy request to stdout
//	-l  address the proxy listens on
//	-d  directory that receives the dumped image data
func main() {
	verbose := flag.Bool("v", true, "should every proxy request be logged to stdout") // defaults to true while debugging
	addr := flag.String("l", ":8888", "on which address should the proxy listen")
	cacheBaseDir := flag.String("d", "cache-basedir", "cache directory to dump image response data")
	flag.Parse()

	proxy := goproxy.NewProxyHttpServer()
	proxy.Verbose = *verbose
	if err := os.MkdirAll(*cacheBaseDir, 0755); err != nil {
		log.Fatal("Can't create cache base dir: ", err)
	}

	// Answer requests to hosts that are blocked by the firewall right away,
	// so that the browser does not stall waiting on them.
	// NOTE(review): presumably this relies on the request carrying a Host
	// header - confirm against goproxy.ReqHostIs.
	proxy.OnRequest(goproxy.ReqHostIs("ajax.googleapis.com")).DoFunc(
		func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) {
			return r, goproxy.NewResponse(r,
				goproxy.ContentTypeText, http.StatusRequestTimeout,
				"access blocked")
		})

	// Tested successfully with http://shop.oreilly.com/category/mobile/new.do
	proxy.OnResponse().DoFunc(newImageDumper(*cacheBaseDir))

	ln, err := net.Listen("tcp", *addr)
	if err != nil {
		log.Fatal("listen: ", err)
	}
	sl := newStoppableListener(ln)

	// signal.Notify never blocks when sending, so the channel needs a buffer
	// or the interrupt could be dropped.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, os.Interrupt)
	go func() {
		<-ch
		log.Println("Got SIGINT exiting")
		// Hold a WaitGroup slot while closing so that Wait below cannot
		// return before the listener has been closed.
		sl.Add(1)
		sl.Close()
		sl.Done()
	}()

	log.Println("Starting Go HttpProxy-ImageDumper...")
	// http.Serve returns once the listener fails or has been closed.
	if err := http.Serve(sl, proxy); err != nil {
		log.Println("http.Serve:", err)
	}
	// Blocks until every accepted connection has been closed, which is why
	// Ctrl+C does not terminate the program immediately.
	sl.Wait()
	log.Println("All connections closed - exit")
}

// newImageDumper returns a goproxy response handler that stores image
// responses below cacheDir and passes every response on to the client.
//
// A response is dumped only when the request method is GET, the status code
// is 200 and the Content-Type names an image type. Two files are written per
// image, both named after the SHA1 hash of the request URL:
//
//	<sha1>.headers   "GET <url>", a blank line, the response headers,
//	                 a blank line, then the request headers
//	<sha1>.<suffix>  the response body, copied while the client reads it
//
// Existing files with the same name are overwritten (os.Create truncates).
//
// NOTE: goproxy apparently does not decode chunked/gzipped bodies, so the
// body is stored exactly as received; this does not affect the client.
//
// TODO: a response without Content-Type could still be probed by decoding it.
// TODO: spread the files over sub-directories; too many files in a single
// directory may hurt performance.
// TODO: there is no index of the original URLs of the cached images; maybe
// keep one in a database such as sqlite.
// TODO: buffer the body in memory and only start writing a file once it
// exceeds 4KB, to avoid caching useless tiny files.
// TODO: consider handing the file writes to a dedicated I/O goroutine rather
// than writing inside the handler.
// FIXME: a server may return different data for the same URL depending on
// other request headers (cookies?); this case is not handled.
func newImageDumper(cacheDir string) func(*http.Response, *goproxy.ProxyCtx) *http.Response {
	return func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
		// goproxy may call the handler with a nil response; pass it through.
		if resp == nil || ctx.Req.Method != "GET" || resp.StatusCode != http.StatusOK {
			return resp
		}
		suffix, ok := imageFileSuffix(resp.Header.Get("Content-Type"))
		if !ok {
			return resp // not an image: nothing to dump
		}

		imageURL := ctx.Req.URL.String()
		urlHash := fmt.Sprintf("%x", sha1.Sum([]byte(imageURL)))
		log.Printf("Url=%s SHA1=%s", imageURL, urlHash)

		headersPath := path.Join(cacheDir, urlHash+".headers")
		if err := writeHeadersFile(headersPath, imageURL, resp.Header, ctx.Req.Header); err != nil {
			// Best effort: a failed dump must not break the proxied response.
			log.Printf("writing %s: %v", headersPath, err)
		}

		// The body file is filled while the client consumes the response,
		// following the goproxy-httpdump example.
		bodyFile := NewFileStream(path.Join(cacheDir, urlHash+"."+suffix))
		resp.Body = NewTeeReadCloser(resp.Body, bodyFile)
		return resp
	}
}

// imageFileSuffix derives a file name suffix from a Content-Type value.
// It reports false when the value does not name an image type.
func imageFileSuffix(contentType string) (string, bool) {
	m := imageSubTypeFinder.FindStringSubmatch(contentType)
	if m == nil {
		return "", false
	}
	switch subType := m[1]; subType {
	case "x-icon":
		return "ico", true
	case "jpeg":
		return "jpg", true
	default:
		return subType, true
	}
}

// writeHeadersFile writes the ".headers" companion file: "GET <url>", a blank
// line, the response headers, a blank line, then the request headers.
// It returns the first write error, or else the error from closing the file.
func writeHeadersFile(filePath, rawURL string, respHeader, reqHeader http.Header) (err error) {
	fs := NewFileStream(filePath)
	defer func() {
		if cerr := fs.Close(); err == nil {
			err = cerr
		}
	}()
	if _, err = fmt.Fprintf(fs, "GET %s\r\n\r\n", rawURL); err != nil {
		return err
	}
	if err = writeHeaderLines(fs, respHeader); err != nil {
		return err
	}
	if _, err = io.WriteString(fs, "\r\n"); err != nil {
		return err
	}
	return writeHeaderLines(fs, reqHeader)
}

// writeHeaderLines writes one "Name: value" line per header value, so that
// repeated headers (type Header map[string][]string) are not lost.
func writeHeaderLines(w io.Writer, h http.Header) error {
	for name, values := range h {
		for _, v := range values {
			if _, err := fmt.Fprintf(w, "%s: %s\r\n", name, v); err != nil {
				return err
			}
		}
	}
	return nil
}
已知问题:Ctrl+C试图结束程序运行时,程序不能立即退出(main在sl.Wait()处要等所有已接受的连接都关闭后才会返回)。