[GoLang Notes] A Tour of Go - Exercise: Web Crawler

This post works through one of the exercises from A Tour of Go; the problem statement and a sample implementation are given below.

Note: since the GFW blocks GAE, the official Go site and the tour may require a workaround to access from within China.

Exercise: Web Crawler

In this exercise you'll use Go's concurrency features to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

In other words, the task is to implement a concurrent crawler that never fetches the same page twice.

The original code to be modified is shown below:

package main

import (
    "fmt"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        Crawl(u, depth-1, fetcher)
    }
    return
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

One reference implementation of Crawl() that satisfies the requirements is shown below. It launches one goroutine per URL, gathers the results over a channel in a single receive loop, uses the progress counter to track how many fetches are still in flight, and records every URL in fetched_set as soon as it is scheduled so that nothing is fetched twice:

package main

import (
    "fmt"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }

    type fetched_res struct {
        url   string
        body  string
        urls  []string
        depth int
        err   error
    }

    // fetched_set records every url that has been scheduled for fetching, so no url is fetched twice
    fetched_set := make(map[string]bool)
    
    // fetched_ch carries the result of each fetch back to the receive loop below
    fetched_ch := make(chan *fetched_res)
    
    // fetch_routine is a function value that does the actual fetch and sends the result to fetched_ch
    fetch_routine := func(url string, depth int) {
        body, urls, err := fetcher.Fetch(url)
        fetched_ch <- &fetched_res{url, body, urls, depth, err}
    }

    // start with the seed url, marking it as scheduled before the first fetch
    fetched_set[url] = true
    go fetch_routine(url, depth)

    // progress counts the fetches currently in flight: every iteration consumes
    // exactly one result, and each newly spawned fetch increments the counter
    for progress := 1; progress > 0; progress-- {
        res_ptr := <-fetched_ch

        // a failed fetch still has to count against progress, so the error is
        // handled here (rather than inside fetch_routine) to keep the in-flight
        // counter accurate
        if res_ptr.err != nil {
            fmt.Println(res_ptr.err)
            continue
        }

        fmt.Printf("found: %s %q\n", res_ptr.url, res_ptr.body)

        // add to fetched url set
        fetched_set[res_ptr.url] = true

        // crawl recursively if max_depth has not been reached yet
        cur_depth := res_ptr.depth - 1
        if cur_depth > 0 {
            for _, candidate := range res_ptr.urls {
                if !fetched_set[candidate] {
                    // mark the url as scheduled *before* spawning the goroutine,
                    // so a page discovered again later (or one whose fetch failed)
                    // is never fetched a second time
                    fetched_set[candidate] = true
                    progress++
                    go fetch_routine(candidate, cur_depth)
                } else {
                    fmt.Printf("fetched already: %s\n", candidate)
                }
            }
        }
    }

    return
}

func main() {
    Crawl("http://golang.org/", 2, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

The exercise does not really fetch pages over the network; fakeFetcher simulates a small hierarchy of pages with canned results.
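
For example, "http://golang.org/cmd/" appears in several link lists above but has no entry in the fetcher map, so fetching it produces the simulated 404. A quick check, assuming the fakeFetcher type and fetcher variable defined above are in scope:

    // an existing page: the url is a key of the fetcher map
    body, _, err := fetcher.Fetch("http://golang.org/pkg/")
    fmt.Println(body, err) // Packages <nil>

    // a missing page: it is linked to but never defined, so Fetch returns an error
    _, _, err = fetcher.Fetch("http://golang.org/cmd/")
    fmt.Println(err) // not found: http://golang.org/cmd/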

One more point worth noting: when the requested URL is not a key of the fakeFetcher map, Fetch returns an error, which simulates a 404. The concurrent Crawl() must handle this error correctly, because it is part of the loop's termination condition: a failed fetch still has to count as a completed fetch so that the progress counter eventually reaches zero. Otherwise no further result ever arrives on the channel, and running the code on the Go Playground ends with a channel deadlock ("all goroutines are asleep - deadlock!").
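
For comparison, another common way to satisfy the exercise is to guard the set of scheduled URLs with a sync.Mutex and to wait for all fetch goroutines with a sync.WaitGroup, instead of counting in-flight fetches in one channel loop. The sketch below follows that pattern; the names safeSet and CrawlAlt are mine, not part of the exercise, and the program assumes the Fetcher interface, fakeFetcher type and fetcher variable from the listing above are appended to it (with the earlier Crawl and main removed):

package main

import (
    "fmt"
    "sync"
)

// safeSet is a mutex-protected set of urls that have already been scheduled.
type safeSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

// tryAdd marks url as scheduled and reports whether it was seen for the first time.
func (s *safeSet) tryAdd(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.seen[url] {
        return false
    }
    s.seen[url] = true
    return true
}

// CrawlAlt fetches url, then launches one goroutine per newly discovered link.
// Every goroutine is registered with wg before it starts and signals wg when it
// returns, so wg.Wait() unblocks only after the whole crawl has finished.
func CrawlAlt(url string, depth int, fetcher Fetcher, set *safeSet, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        if set.tryAdd(u) {
            wg.Add(1)
            go CrawlAlt(u, depth-1, fetcher, set, wg)
        }
    }
}

func main() {
    seed := "http://golang.org/"
    set := &safeSet{seen: map[string]bool{seed: true}}
    var wg sync.WaitGroup
    wg.Add(1)
    go CrawlAlt(seed, 4, fetcher, set, &wg)
    wg.Wait()
}

With this structure the error case needs no extra bookkeeping: a failed fetch simply prints the error and returns, the deferred wg.Done() still runs, and wg.Wait() cannot block forever.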

====================== EOF =======================

