Heritrix 3.1.0 源码解析(二十七)

上文分析了Heritrix3.1.0系统对请求认证机制的封装,本文接下来分析Heritrix3.1.0系统对cookies处理的封装

Heritrix3.1.0系统提供了CookieStorage接口,用于提供cookies的存储

CookieStorage接口很简单,声明了保存cookies对象的Map容器的方法和获取cookies对象的Map容器的方法

/**
 * Storage abstraction for the cookies used by the crawler.
 * Because it extends {@link Lifecycle}, every implementation also
 * exposes the Lifecycle operations.
 */
public interface CookieStorage extends Lifecycle {



    /**
     * Returns the cookies held by this storage.
     *
     * @return sorted map from a String key to its {@link Cookie}
     */
    SortedMap<String,Cookie> getCookiesMap();



    /**
     * Saves the supplied cookies; how and where is up to the implementation.
     *
     * @param map the cookies to save
     */
    void saveCookiesMap(Map<String,Cookie> map);



}

抽象类AbstractCookieStorage实现了CookieStorage接口,用于为具体实现类提供公用模板 

/**
 * Shared template for {@link CookieStorage} implementations.
 *
 * <p>Tracks the running flag, optionally loads cookies from a cookies.txt
 * style file in {@link #start()}, and optionally dumps cookies to a file in
 * {@link #saveCookiesMap(Map)}. Subclasses supply the backing map through
 * {@link #prepareMap()}, {@link #getCookiesMap()} and
 * {@link #innerSaveCookiesMap(Map)}.
 */
public abstract class AbstractCookieStorage 
    implements CookieStorage, 
               Lifecycle, // InitializingBean, 
               Closeable {

    final private static Logger LOGGER = 
        Logger.getLogger(AbstractCookieStorage.class.getName());

    /**
     * Matches lines that are blank or contain only a '#' comment. Compiled
     * once here instead of on every input line via String.matches.
     */
    private static final java.util.regex.Pattern SKIPPED_LINE =
        java.util.regex.Pattern.compile("\\s*(?:#.*)?");

    /** Number of tab-delimited fields that make up one cookie entry. */
    private static final int FIELDS_PER_LINE = 7;

    /** Optional file that cookies are loaded from when the storage starts. */
    protected ConfigFile cookiesLoadFile = null;

    public ConfigFile getCookiesLoadFile() {
        return cookiesLoadFile;
    }

    public void setCookiesLoadFile(ConfigFile cookiesLoadFile) {
        this.cookiesLoadFile = cookiesLoadFile;
    }

    /** Optional path that cookies are written to by {@link #saveCookiesMap(Map)}. */
    protected ConfigPath cookiesSaveFile = null;

    public ConfigPath getCookiesSaveFile() {
        return cookiesSaveFile;
    }

    public void setCookiesSaveFile(ConfigPath cookiesSaveFile) {
        this.cookiesSaveFile = cookiesSaveFile;
    }

    boolean isRunning = false; 

    /**
     * Initializes the storage: asks the subclass for its map and, when a load
     * file is configured, fills that map from the file. Does nothing if the
     * storage is already running.
     */
    @Override
    public void start() {
        if(isRunning()) {
            return;
        }
        SortedMap<String,Cookie> cookies = prepareMap();
        if (getCookiesLoadFile()!=null) {
            // populate the freshly prepared map from the configured file
            loadCookies(getCookiesLoadFile(), cookies);
        }
        isRunning = true; 
    }

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    @Override
    public void stop() {
        isRunning = false; 
    }

    /**
     * Creates or opens the map that holds the cookies; implemented by the
     * concrete subclass.
     *
     * @return the map that {@link #start()} loads cookies into
     */
    protected abstract SortedMap<String,Cookie> prepareMap();    

    /**
     * Reads cookie entries from {@code reader} and puts them into
     * {@code cookies}, keyed by {@code Cookie.getSortKey()}.
     *
     * <p>Blank lines and '#' comment lines are skipped. Every other line must
     * hold 7 tab-delimited fields: domain, domain-attribute flag, path,
     * secure flag, expiration in epoch seconds (negative for none), name and
     * value. A malformed line is logged at WARNING level and skipped; it does
     * not abort the load. An I/O error is logged and ends the load.
     *
     * @param reader  source of the cookie lines; not closed by this method
     * @param cookies map receiving the parsed cookies
     */
    public static void loadCookies(Reader reader,
            SortedMap<String, Cookie> cookies) {
        BufferedReader br = new BufferedReader(reader);
        try {
            String line;
            int lineNo = 1;
            while ((line = br.readLine()) != null) {
                if (!SKIPPED_LINE.matcher(line).matches()) { // skip blank lines and comments
                    // Limit -1 keeps trailing empty tokens, so an entry that
                    // saveCookies wrote with an empty value still has 7 fields.
                    String[] tokens = line.split("\\t", -1);
                    int fieldCount = tokens.length;
                    // Stay tolerant of stray trailing tabs after the value.
                    while (fieldCount > FIELDS_PER_LINE
                            && tokens[fieldCount - 1].length() == 0) {
                        fieldCount--;
                    }
                    if (fieldCount == FIELDS_PER_LINE) {
                        Cookie cookie = null;
                        try {
                            cookie = parseCookie(tokens);
                        } catch (IllegalArgumentException e) {
                            // NumberFormatException (bad expiration) is an
                            // IllegalArgumentException; catching the parent also
                            // covers a Cookie constructor that rejects a field.
                            LOGGER.log(Level.WARNING, "cookies input line "
                                    + lineNo + " invalid, skipping", e);
                        }
                        if (cookie != null) {
                            LOGGER.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie.toExternalForm());
                            cookies.put(cookie.getSortKey(), cookie);
                        }
                    } else {
                        LOGGER.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens");
                    }
                }
                lineNo++;
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,e.getMessage(), e);
        }
    }

    /**
     * Builds a Cookie from the 7 fields of one input line.
     *
     * @param tokens the split line; only indexes 0-6 are read
     * @return the parsed cookie
     * @throws NumberFormatException if the expiration field is not a long
     */
    private static Cookie parseCookie(String[] tokens) {
        long epochSeconds = Long.parseLong(tokens[4]);
        // a negative expiration means the cookie carries no expiry date
        Date expirationDate = (epochSeconds >= 0 ? new Date(epochSeconds * 1000) : null);
        Cookie cookie = new Cookie(tokens[0], tokens[5],
                tokens[6], tokens[2], expirationDate, 
                Boolean.parseBoolean(tokens[3]));
        cookie.setDomainAttributeSpecified(Boolean.parseBoolean(tokens[1]));
        return cookie;
    }

    /**
     * Loads cookies from a configured file into {@code cookies}; the reader
     * obtained from the file is always closed.
     *
     * @param file    configured cookie file
     * @param cookies map receiving the parsed cookies
     */
    protected static void loadCookies(ConfigFile file,
            SortedMap<String, Cookie> cookies) {
        Reader reader = null;
        try {
            reader = file.obtainReader();
            loadCookies(reader, cookies);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Loads cookies from the file at the given path into {@code result}.
     * A null or empty path is ignored; a missing file is logged at WARNING
     * level.
     *
     * @param cookiesFile path of the cookie file, may be null or empty
     * @param result      map receiving the parsed cookies
     */
    public static void loadCookies(String cookiesFile, 
            SortedMap<String,Cookie> result) {

        // Do nothing if cookiesFile is not specified.
        if (cookiesFile == null || cookiesFile.length() <= 0) {
            return;
        }

        FileReader reader = null;
        try {
            reader = new FileReader(cookiesFile);
            loadCookies(reader, result);
        } catch (FileNotFoundException e) {
            LOGGER.log(Level.WARNING,"Could not find file: " + cookiesFile, e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Writes every cookie in {@code cookies} to the given file, one
     * tab-delimited line per cookie, in the field order that
     * {@link #loadCookies(Reader, SortedMap)} reads. A null or empty path is
     * ignored; an I/O failure is logged at SEVERE level.
     *
     * @param saveCookiesFile path of the file to (over)write
     * @param cookies         cookies to dump
     */
    public static void saveCookies(String saveCookiesFile, Map<String,Cookie> cookies) { 
        // Do nothing if cookiesFile is not specified. 
        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { 
            return; 
        }

        FileOutputStream out = null; 
        try { 
            out = new FileOutputStream(new File(saveCookiesFile)); 
            String tab ="\t"; 
            out.write("# Heritrix Cookie File\n".getBytes()); 
            out.write("# This file is the Netscape cookies.txt format\n\n".getBytes()); 
            for (Cookie cookie: cookies.values()) { 
                // Guess an initial size 
                MutableString line = new MutableString(1024 * 2); 
                line.append(cookie.getDomain()); 
                line.append(tab);
                line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE"); 
                line.append(tab); 
                line.append(cookie.getPath());
                line.append(tab); 
                line.append(cookie.getSecure() ? "TRUE" : "FALSE"); 
                line.append(tab);
                // expiry is stored as epoch seconds, -1 when the cookie has none
                line.append(cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() / 1000 : -1);
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);                
                // a null value is written as an empty last field
                line.append(cookie.getValue() != null ? cookie.getValue() : ""); 
                line.append("\n");
                out.write(line.toString().getBytes()); 
            } 
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e);
        } finally {
            IOUtils.closeQuietly(out);
        } 
    }

    /**
     * Returns the cookie map; implemented by the concrete subclass.
     */
    @Override
    public abstract SortedMap<String,Cookie> getCookiesMap();

    /**
     * Saves the cookies: first delegates to the subclass hook, then, when a
     * save path is configured, also dumps the map to that file.
     *
     * @param map cookies to save
     */
    @Override
    public void saveCookiesMap(Map<String, Cookie> map) {
        // subclass-specific persistence
        innerSaveCookiesMap(map);
        if (getCookiesSaveFile()!=null) {
            saveCookies(getCookiesSaveFile().getFile().getAbsolutePath(), map);
        }
    }

    /**
     * Subclass hook called by {@link #saveCookiesMap(Map)} before the
     * optional file dump.
     *
     * @param map cookies to save
     */
    protected abstract void innerSaveCookiesMap(Map<String,Cookie> map);

    /** No-op in this base class. */
    @Override
    public void close() throws IOException {
    }

}

Heritrix3.1.0提供了两个继承类,分别为BdbCookieStorage和SimpleCookieStorage,前者将cookies保存在BDB数据库,后者保存在Map对象里面

BdbCookieStorage类的相关方法如下

// BDB module that prepareMap() uses to get the class catalog and open the cookie database.
protected BdbModule bdb;

    /**
     * Sets the BDB module; the setter is annotated {@code @Autowired}.
     *
     * @param bdb module used to open the cookie database
     */
    @Autowired

    public void setBdbModule(BdbModule bdb) {

        this.bdb = bdb;

    }

    

    /**
     * Whether this is a checkpoint recovery; handed to
     * BdbModule.openDatabase in prepareMap() (presumably so that stored
     * cookie data is reused -- confirm against BdbModule).
     */

    boolean isCheckpointRecovery = false; 

    

    /**
     * Name of the BDB database opened in prepareMap().
     * NOTE(review): public, static and non-final -- consider making it final
     * if nothing reassigns it.
     */
    public static String COOKIEDB_NAME = "http_cookies";

 

    /** Handle of the opened cookie database; assigned in prepareMap(). */
    private transient Database cookieDb;

    /** Map view over cookieDb; assigned in prepareMap(), returned by getCookiesMap(). */
    private transient StoredSortedMap<String,Cookie> cookies;



    /** No-argument constructor; all set-up happens later in prepareMap(). */
    public BdbCookieStorage() {

    }



    /**
     * Opens the cookie database (non-transactional, creation allowed) through
     * the BDB module and wraps it in a {@link StoredSortedMap} with String
     * keys and serialized {@link Cookie} values.
     *
     * @return the database-backed map, which is also stored in {@code cookies}
     * @throws RuntimeException wrapping any {@link DatabaseException}
     */
    protected SortedMap<String,Cookie> prepareMap() {
        try {
            StoredClassCatalog catalog = bdb.getClassCatalog();

            BdbModule.BdbConfig config = new BdbModule.BdbConfig();
            config.setTransactional(false);
            config.setAllowCreate(true);
            cookieDb = bdb.openDatabase(COOKIEDB_NAME, config, isCheckpointRecovery);

            StringBinding keyBinding = new StringBinding();
            SerialBinding<Cookie> valueBinding =
                new SerialBinding<Cookie>(catalog, Cookie.class);
            // last argument is true, as in the original call
            // (the write-allowed flag in the BDB collections API)
            cookies = new StoredSortedMap<String,Cookie>(
                cookieDb, keyBinding, valueBinding, true);
            return cookies;
        } catch (DatabaseException dbFailure) {
            throw new RuntimeException(dbFailure);
        }
    }



    /**
     * Returns the database-backed cookie map created by prepareMap().
     *
     * @return the {@code cookies} field; null if prepareMap() has not assigned it yet
     */
    public SortedMap<String, Cookie> getCookiesMap() {

//        assert cookies != null : "cookie map not set up";

        return cookies;

    }



    /**
     * Intentionally empty. NOTE(review): presumably nothing needs saving here
     * because the map from prepareMap() is backed by the BDB database --
     * confirm.
     *
     * @param map ignored
     */
    protected void innerSaveCookiesMap(Map<String, Cookie> map) {

    }

SimpleCookieStorage类与之类似,不在这里贴出来了

这里需要注意的是,Heritrix3.1.0系统改写了HttpClient组件的Cookie类,逻辑与HttpClient组件的Cookie类类似

那么Heritrix3.1.0系统怎样将CookieStorage接口实现类获取的SortedMap<String, Cookie>容器中的Cookies添加到HttpClient组件的相关对象中呢?

Heritrix3.1.0系统还改写了HttpClient组件的HttpState类,添加了设置SortedMap cookiesMap对象的方法,相关方法如下

// Cookie store of this HttpState: starts as a ConcurrentSkipListMap and can be replaced via setCookiesMap(SortedMap).
private SortedMap cookiesMap = new ConcurrentSkipListMap();

// START IA/HERITRIX ADDITIONS

    /**
     * Returns a sorted map of {@link Cookie cookies} that this HTTP
     * state currently contains.
     * 
     * Any operations on this map should be synchronized with respect 
     * to this HttpState instance.
     * 
     * @return sorted map of {@link Cookie cookies}; this is the internal
     *         map itself, not a copy
     */

    public SortedMap getCookiesMap() {

        return cookiesMap;

    }

    

    /**
     * Replace the standard sorted map with an external implementation 
     * (such as one backed by persistent store, like BDB's StoredSortedMap.)
     * 
     * The given map is stored as-is; no copy is made and null is not checked.
     * 
     * @param map alternate sorted map to use to store cookies
     */

    public void setCookiesMap(SortedMap map) {

        this.cookiesMap = map;

    }

// END IA/HERITRIX ADDITIONS

同时HttpMethodBase对象相关方法里面从HttpState state对象获取Cookies对象也做了相应的改写 

/**

     * Generates <tt>Cookie</tt> request headers for those {@link Cookie cookie}s

     * that match the given host, port and path.

     *

     * @param state the {@link HttpState state} information associated with this method

     * @param conn the {@link HttpConnection connection} used to execute

     *        this HTTP method

     *

     * @throws IOException if an I/O (transport) error occurs. Some transport exceptions

     *                     can be recovered from.

     * @throws HttpException  if a protocol exception occurs. Usually protocol exceptions 

     *                    cannot be recovered from.

     */

    protected void addCookieRequestHeader(HttpState state, HttpConnection conn)
        throws IOException, HttpException {

        LOG.trace("enter HttpMethodBase.addCookieRequestHeader(HttpState, "
                  + "HttpConnection)");

        // Remove auto-generated Cookie headers; all other Cookie headers are kept.
        for (Header existing : getRequestHeaderGroup().getHeaders("Cookie")) {
            if (existing.isAutogenerated()) {
                getRequestHeaderGroup().removeHeader(existing);
            }
        }

        CookieSpec matcher = getCookieSpec(state);

        // The virtual host, when set, takes precedence over the connection's host.
        String virtualHost = this.params.getVirtualHost();
        String host = (virtualHost != null) ? virtualHost : conn.getHost();

        // BEGIN IA/HERITRIX CHANGES
        Cookie[] matched = matcher.match(host, conn.getPort(),
            getPath(), conn.isSecure(), state.getCookiesMap());
        // END IA/HERITRIX CHANGES

        // Nothing matched: no Cookie header is added.
        if (matched == null || matched.length == 0) {
            return;
        }

        if (getParams().isParameterTrue(HttpMethodParams.SINGLE_COOKIE_HEADER)) {
            // In strict mode put all cookies on the same header
            String combined = matcher.formatCookies(matched);
            getRequestHeaderGroup().addHeader(new Header("Cookie", combined, true));
            return;
        }

        // In non-strict mode put each cookie on a separate header
        for (Cookie single : matched) {
            String formatted = matcher.formatCookie(single);
            getRequestHeaderGroup().addHeader(new Header("Cookie", formatted, true));
        }
    }

最后我们怎样在配置文件crawler-beans.cxml配置cookie文件呢,本人做了一个示例

 <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
 <bean id="cookieStorage" 
   class="org.archive.modules.fetcher.BdbCookieStorage">
  <!-- file that cookies are loaded from when the storage starts -->
  <property name="cookiesLoadFile"><ref bean="cookieInit"/></property>
  <!-- file that cookies are dumped to when saveCookiesMap is called -->
  <property name="cookiesSaveFile"><ref bean="cookieSave"/></property>
  <!-- the setter is setBdbModule(BdbModule), so the bean property is
       "bdbModule"; a property named "bdb" has no matching setter -->
  <property name="bdbModule">
   <ref bean="bdb"/>
  </property>
 </bean>

 <!-- input cookie file, referenced by cookiesLoadFile above -->
 <bean id="cookieInit" class="org.archive.spring.ConfigFile">
  <property name="name" value="cookie.txt" />
  <property name="path" value="/root/stpl/cookie.txt" />
 </bean>

 <!-- output cookie dump, referenced by cookiesSaveFile above -->
 <bean id="cookieSave" class="org.archive.spring.ConfigPath">
  <property name="name" value="cookies_dump.txt" />
  <property name="path" value="/root/stpl/cookies_dump.txt" />
 </bean>

cookie.txt文件格式可以参考下面这段英文注释(注意:每行的7个字段之间必须用制表符Tab分隔)

* format. Example entry of cookies.txt file:

     * <p>

     * www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond

     * </p>

     * <p>

     * Each line has 7 tab-separated fields:

     * </p>

     * <ol>

     * <li>DOMAIN: The domain that created and has access to the cookie value.</li>

     * <li>FLAG: A TRUE or FALSE value indicating if hosts within the given

     * domain can access the cookie value.</li>

     * <li>PATH: The path within the domain that the cookie value is valid for.</li>

     * <li>SECURE: A TRUE or FALSE value indicating if to use a secure

     * connection to access the cookie value.</li>

     * <li>EXPIRATION: The expiration time of the cookie value, in seconds

     * since the Unix epoch, or -1 for no expiration</li>

     * <li>NAME: The name of the cookie value</li>

     * <li>VALUE: The cookie value</li>

     * </ol>

---------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/28/3049673.html

你可能感兴趣的:(Heritrix)