http://themeforest.net 内有很多的设计很好的网站模板, 平时没事就在那找漂亮的模板收藏, 但都收费, 不过既然那些模板都有Demo写个爬虫不就可以全部下载? 于是就快速用PHP写了一个模板下载程序, 但速度欠佳, 不支持多线程. 又因之前断断续续学了Go语言, 干脆用它来重写了, 使用了Goroutine速度快了很多.
正在学习Go语言的同学们, 看到模板想下载的同学们可以用用.
github地址: https://github.com/lealife/Lea-Web-Template-Spider
附主要代码:
<!-- lang: cpp -->
package lealife
import (
"io/ioutil"
"net/http"
"strings"
"regexp"
"log"
"os"
"path/filepath"
"lealife/util"
"sync"
)
type LeaSpider struct {
indexUrl string
scheme string // http:// 或 https://
host string // www.lealife.com lealife.com
schemeAndHost string // http://lealife.com
targetPath string
noChildrenFileExts []string
hadDoneUrl map[string] bool
exceptionUrl map[string] bool
defaultFilename string // 生成的文件名
t int
goroutineNum int // 正在运行的goroutine数目
lock *sync.Mutex
// 并发
w sync.WaitGroup
ch chan bool
}
// 实例化LeaSpider
func NewLeaSpider() *LeaSpider {
lea := &LeaSpider{
targetPath: "D:",
defaultFilename: "index.html",
t: 1,
goroutineNum: 0,
lock: &sync.Mutex{},
noChildrenFileExts: []string{".js", ".ico", ".png", ".jpg", ".gif"}}
lea.ch = make(chan bool, 1000) // 仅limit个goroutine
lea.hadDoneUrl = make(map[string]bool, 1000)
lea.exceptionUrl = make(map[string]bool, 1000)
lea.setLogOutputWriter()
return lea
}
// 入口
func (this *LeaSpider) Fetch(url, targetPath string) {
url = strings.TrimSpace(url)
this.parseUrl(url)
// 保存路径
this.doTargetPath(targetPath)
// 去掉scheme
// a.com, a.com/index.html
url = util.Substring(url, len(this.scheme))
// url2, ok := this.getRalativeUrl("a.com/b/c/d/kk/eee.html", "http://a.com/e/c/d/kk")
// println(url2)
// println(ok)
// return
this.goDo(url, false)
this.w.Wait()
// 处理异常
this.doExceptionUrl()
}
// go routine do it
func (this *LeaSpider) goDo(url string, needException bool) {
// this.do(url, false)
this.w.Add(1)
// println(">>>>>>>>>>>>申请资源" + url)
this.ch <- true // 使用资源
// println(">>>>>>>>>>>>申请资源成功" + url)
this.lock.Lock()
this.goroutineNum++
log.Println("当前共运行", this.goroutineNum, "goroutine")
this.lock.Unlock()
go func() {
defer func() {
this.w.Done()
}()
children := this.do(url, needException)
this.lock.Lock()
this.goroutineNum--
log.Println("当前共运行", this.goroutineNum, " goroutine")
this.lock.Unlock()
// println("<<<<<<<<<<<<<释放资源" + url)
<-this.ch // 释放资源
for _, cUrl := range children {
this.goDo(cUrl, false)
}
}()
}
// needException 需要处理异常?
// 这里的url可能是: a.com/b/c/d(没有schema), 不是以.html, .css, .js为后缀的
// 那么断定是一个页面, 此时自动生成一个文件名 => a.com/b/c/d/d_leaui_index.html
// 生成的文件名都按一个规则即可, 不必事先mapping
// 返回一个[]string 待处理的子
func (this *LeaSpider) do(url string, needException bool) (children []string) {
children = nil
url = this.trimUrl(url)
if this.isNotNeedUrl(url, needException) {
return;
}
// 文件是否已存在
// url = a.com/a/?id=12&id=1221, 那么genUrl=a.com/a/index.html?id=121
genUrl := this.genUrl(url)
if this.isExists(genUrl) {
return;
}
// 得到内容
fullUrl := this.scheme + url
if needException {
log.Println("正在处理 `异常` " + fullUrl)
} else {
log.Println("正在处理 " + fullUrl)
}
content, err := this.getContent(fullUrl)
if !needException && (err != nil || content == "") { // !needException防止处理异常时无限循环
this.exceptionUrl[url] = true
return;
}
this.hadDoneUrl[url] = true
ext := strings.ToLower(filepath.Ext(this.trimQueryParams(url))) // 很可能是a.css?v=1.3
// css文件中 url(../../img/search-icon.png)
if(ext == ".css") {
children = this.doCSS(url, content)
return;
}
// 如果是js, image文件就不往下执行了
if(util.InArray(this.noChildrenFileExts, ext)) {
// 保存该文件
if !this.writeFile(url, content) {
return;
}
return;
}
if(this.t == 1) {
// 解析html里的href, src
children = this.doHTML(url, genUrl, content)
}
return
}
// 处理css
func (this *LeaSpider) doCSS(url, content string) (children []string) {
children = nil
// 保存该文件
if !this.writeFile(url, content) {
return;
}
regular := "(?i)url\\((.+?)\\)"
reg := regexp.MustCompile(regular)
re := reg.FindAllStringSubmatch(content, -1)
log.Println(url + " 含有: ");
log.Println(re);
baseDir := filepath.Dir(url)
for _, each := range re {
cUrl := this.trimUrl(each[1])
// 这里, goDo会申请资源, 导致doCSS一直不能释放资源
children = append(children, this.cleanUrl(baseDir + "/" + cUrl))
}
return
}
// url : a.com/a/b/d.html
// a.com/a/b/c genFilename: c_leaui_index.html
// 生成子的相对目录有用
func (this *LeaSpider) doHTML(pUrl, realPUrl, content string) (children []string) {
regular := "(?i)(src=|href=)[\"']([^#].*?)[\"']"
reg := regexp.MustCompile(regular)
re := reg.FindAllStringSubmatch(content, -1)
log.Println(pUrl + " => " + realPUrl);
log.Println(pUrl + " 含有: ");
//log.Println(re);
baseDir := filepath.Dir(realPUrl)
for _, each := range re {
// 为了完整替换
// 只替换src=""里的会有子串的问题, 一个url是另一个url子串
rawFullUrl := each[0] // src='http://www.uiueux.com/wp/webzine/wp-content/themes/webzine/js/googlefont.js.php?ver=1.6.4'
rawFullUrlPrefix := each[1]; // src=
// http://a.com/, /a/b/c/d.html, /a/b.jgp
// 如果是/a/b.jpg, 那么是相对host的, 而不是本文件的路径
rawCUrl := each[2]
cUrl := rawCUrl; // strings.TrimRight(rawCUrl, "/") // 万一就是/呢?
// 如果一个链接以//开头, 那么省略了http:, 如果以/开头, 则相对于host
prefixNotHttp := false
if strings.HasPrefix(cUrl, "//") {
cUrl = this.scheme + util.Substring(cUrl, 2)
prefixNotHttp = true
} else if strings.HasPrefix(cUrl, "/") {
cUrl = this.schemeAndHost + cUrl
}
// 如果这个url是一个目录, 新建一个文件
// 如果这个url是以http://a.com开头的, host是一样的,
// 那么content的url是相对于该url
// 生成的url, 如果是目录, 会生成一个文件
cRealUrl, ok := this.getRalativeUrl(realPUrl, cUrl)
// 错误, 不是本页面, 本host的页面
if ok == -1 {
// 如果之前//替换成了http://
if prefixNotHttp {
content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix + "\"" + cRealUrl + "\"", -1)
}
continue
}
// 表示已处理过, 是相对目录了, 必须把内容的替换掉
// 但要处理的还是之前的链接http://
if ok == 1 {
cRealUrl = strings.Trim(cRealUrl, "/")
// 把//变成/
for strings.Index(cRealUrl, "//") != -1 {
cRealUrl = strings.Replace(cRealUrl, "//", "/", -1)
}
log.Println(rawCUrl + " >>>>>> " + cRealUrl)
content = strings.Replace(content, rawFullUrl, rawFullUrlPrefix + "\"" + cRealUrl + "\"", -1)
cUrl = strings.Replace(cUrl, this.scheme, "", 1) // 把sheme去掉, do
children = append(children, cUrl) // 不需要clean
} else {
children = append(children, this.cleanUrl(baseDir + "/" + cRealUrl))
}
}
// 把content保存起来
if !this.writeFile(realPUrl, content) {
return;
}
// this.t++
// return
return
}
// 得到相对目录
// realPUrl: a.com/b/c/index.html 不是a.com/b/c
// cUrl如果是以this.scheme + this.host开头, 则需要转换成相对目录
// cUrl a.com/c/d/e/g
// 在realPUrl页面到cUrl跳转
func (this *LeaSpider) getRalativeUrl(realPUrl, cUrl string) (url string, ok int) {
ok = 0
url = cUrl
if strings.HasPrefix(cUrl, this.scheme + this.host) {
url = ""
ok = 1
realCUrl := this.genUrl(cUrl) // 如果是目录, 生成一个
// 如果realPUrl == realCurl 那么返回"#"
realPUrl = strings.Replace(realPUrl, this.host, "", 1) // 去掉a.com
realCUrl = strings.Replace(realCUrl, this.scheme + this.host, "", 1) // 去掉http://a.com
realPUrl = this.trimUrl(realPUrl)
realCUrl = this.trimUrl(realCUrl)
if realPUrl == realCUrl {
url = "#"
return
}
// 去掉两个url相同的部分
realPUrlArr := strings.Split(realPUrl, "/")
realCUrlArr := strings.Split(realCUrl, "/")
log.Println(realPUrlArr)
log.Println(realCUrlArr)
i, j := 0, 0
for ; i < len(realCUrlArr) && j < len(realPUrlArr) && realCUrlArr[i] == realPUrlArr[j]; {
realCUrlArr[i] = ""
i++
j++
}
// 有多个少../?
n := len(realPUrlArr) - i - 1
for k := 0; k < n; k++ {
url += "../"
}
url += strings.Join(realCUrlArr, "/")
return;
}
// 如果是以http://, https://开头的, 返回false
if strings.HasPrefix(cUrl, "http://") || strings.HasPrefix(cUrl, "https://") {
ok = -1
return
}
return
}
// trimSpace, /, \, ", '
func (this *LeaSpider) trimUrl(url string) string {
if(url != "") {
url = strings.TrimSpace(url)
url = strings.Trim(url, "\"")
url = strings.Trim(url, "'")
url = strings.Trim(url, "/")
url = strings.Trim(url, "\\")
}
return url
}
// 处理异常
func (this *LeaSpider) doExceptionUrl() {
if(len(this.exceptionUrl) > 0) {
log.Println("正在处理异常Url....");
for url, _ := range this.exceptionUrl {
this.do(url, true)
}
}
}
// 如果url是 a.com/b/c/d
// 生成一个文件a.com/b/c/d/d_leaui_index.html
// 返回 d_leaui_index.html
// 如果不是一个目录, 返回""
func (this *LeaSpider) genFilename(url string) (string, bool) {
urlArr := strings.Split(url, "/")
if urlArr != nil {
last := urlArr[len(urlArr) - 1]
ext := strings.ToLower(filepath.Ext(last))
if ext == "" {
return this.defaultFilename, true // 需要append到url后面
} else if util.InArray([]string{".php", ".jsp", ".asp", ".aspx"}, ext) {
filename := filepath.Base(last) // a.php
filename = util.Substr(filename, 0, len(filename) - len(ext)) // a
return filename + ".html", false
}
}
return "", true;
}
// 生成真实的url
// 传来的url可能是http://a.com, 也可能是a.com
// getRelativeUrl传来的可以是http://a.com
// url = a.com/a/?id=12&id=1221, 那么genUrl=a.com/a/index.html?id=121
func (this *LeaSpider) genUrl(url string) string {
// 去掉?后面的
queryParam, fragment := "", "" // 包含?,#
pos := strings.Index(url, "?");
if pos != -1 {
queryParam = util.Substring(url, pos)
url = util.Substr(url, 0, pos);
} else {
pos = strings.Index(url, "#");
if pos != -1 {
fragment = util.Substring(url, pos)
url = util.Substr(url, 0, pos);
}
}
// 如果url == host
if url == this.host || url == this.schemeAndHost {
return url + "/" + this.defaultFilename + queryParam + fragment
}
genFilename, needApend := this.genFilename(url)
if genFilename != "" {
if needApend {
url += "/" + genFilename + queryParam + fragment
} else {
// 是a.php => a.html
urlArr := strings.Split(url, "/")
urlArr = urlArr[:len(urlArr)-1]
url = strings.Join(urlArr, "/") + "/" + genFilename
}
}
return url
}
func (this *LeaSpider) writeFile(url, content string) bool {
// $path = a.html?a=a11
url = this.trimQueryParams(url)
fullPath := this.targetPath + "/" + url
dir := filepath.Dir(fullPath)
log.Println("写目录", dir);
if err := os.MkdirAll(dir, 0777); err != nil {
log.Println("写目录" + dir + " 失败")
return false
}
// 写到文件中
file, err := os.Create(fullPath)
defer file.Close()
if err != nil {
log.Println("写文件" + fullPath + " 失败")
return false
}
file.WriteString(content)
return true;
}
func (this *LeaSpider) cleanUrl(url string) string {
url = filepath.Clean(url)
return strings.Replace(url, "\\", "/", -1)
}
// 将url ?, #后面的字符串去掉
func (this *LeaSpider) trimQueryParams(url string) string {
pos := strings.Index(url, "?");
if pos != -1 {
url = util.Substr(url, 0, pos);
}
pos = strings.Index(url, "#");
if pos != -1 {
url = util.Substr(url, 0, pos);
}
return url;
}
// 判断是否已存在
// url = a/b/c/d.html
func (this *LeaSpider) isExists(url string) bool {
return util.IsExists(this.targetPath + "/" + url)
}
// 不需要处理的url
// needException false 表示不要处理, 那么就要判断是否在其中
func (this *LeaSpider) isNotNeedUrl(url string, needException bool) bool {
if _, ok := this.hadDoneUrl[url]; ok {
return true
}
_, ok := this.exceptionUrl[url];
if !needException && ok {
return true
}
// http:\\/|https:\\/|
regular := "#|javascript:|mailto:|" class=|@.*?\\..+"
reg := regexp.MustCompile(regular)
if reg.MatchString(url) {
return true
}
if (strings.HasPrefix(url, "http:/") || strings.HasPrefix(url, "https:/")) &&
!strings.HasPrefix(url, this.scheme + this.host) {
return true
}
return false
}
// 处理url, 得到scheme, host
func (this *LeaSpider) parseUrl(url string) {
if(strings.HasPrefix(url, "http://")) {
this.scheme = "http://";
} else {
this.scheme = "https://";
}
// http://lealife.com/b/c
url = strings.Replace(url, this.scheme, "", 1)
index := strings.Index(url, "/")
if(index == -1) {
this.host = url
} else {
this.host = util.Substr(url, 0, index)
}
this.schemeAndHost = this.scheme + this.host
}
func (this *LeaSpider) getNoChildrenFileExts() []string {
return this.noChildrenFileExts;
}
// 得到内容
func (this *LeaSpider) getContent(url string) (content string, err error) {
var resp *http.Response
resp, err = http.Get(url)
if(resp != nil && resp.Body != nil) {
defer resp.Body.Close()
} else {
log.Println("ERROR " + url + " 返回为空 ")
}
if resp == nil || resp.Body == nil || err != nil || resp.StatusCode != http.StatusOK {
log.Println("ERROR " + url)
log.Println(err)
return
}
var buf []byte
buf, err = ioutil.ReadAll(resp.Body)
if(err != nil) {
return
}
content = string(buf);
return
}
// 生成存储位置
func (this *LeaSpider) doTargetPath(path string) {
path = strings.TrimRight(path, "/"); // 不能TrimLeft, 万一是linux呢?
path = strings.Trim(path, "\\");
if path != "" {
this.targetPath = path;
}
// 生成目录
if this.targetPath != "" {
os.MkdirAll(this.targetPath, 0777)
} else {
panic("存储位置异常")
}
}
func (this *LeaSpider) setLogOutputWriter() {
/*
logfile, err := os.OpenFile("C:/Users/Administrator/workspace/lea/log.txt", os.O_RDWR|os.O_CREATE, 0);
if err != nil {
log.Printf("%s\r\n", err.Error());
os.Exit(-1);
}
log.SetOutput(logfile)
*/
}