上文分析了Heritrix3.1.0系统对请求认证机制的封装,本文接下来分析Heritrix3.1.0系统对cookies处理的封装
Heritrix3.1.0系统提供了CookieStorage接口,用于提供cookies的存储
CookieStorage接口很简单,声明了保存cookies对象的Map容器的方法和获取cookies对象的Map容器的方法
/**
 * Contract for components that hold the crawler's cookies.
 *
 * <p>Extends {@link Lifecycle}, so implementations are started and stopped
 * like any other lifecycle-managed component.</p>
 */
public interface CookieStorage extends Lifecycle {

    /**
     * Returns the cookies held by this storage.
     *
     * @return sorted map of cookies, keyed by a String
     */
    SortedMap<String,Cookie> getCookiesMap();

    /**
     * Saves the given cookies.
     *
     * @param map the cookies to save, keyed by a String
     */
    void saveCookiesMap(Map<String,Cookie> map);
}
抽象类AbstractCookieStorage实现了CookieStorage接口,用于为具体实现类提供公用模板
public abstract class AbstractCookieStorage implements CookieStorage, Lifecycle, // InitializingBean, Closeable { final private static Logger LOGGER = Logger.getLogger(AbstractCookieStorage.class.getName()); //cookies配置文件(用于加载) protected ConfigFile cookiesLoadFile = null; public ConfigFile getCookiesLoadFile() { return cookiesLoadFile; } public void setCookiesLoadFile(ConfigFile cookiesLoadFile) { this.cookiesLoadFile = cookiesLoadFile; } //cookies文件路径(用于保存) protected ConfigPath cookiesSaveFile = null; public ConfigPath getCookiesSaveFile() { return cookiesSaveFile; } public void setCookiesSaveFile(ConfigPath cookiesSaveFile) { this.cookiesSaveFile = cookiesSaveFile; } boolean isRunning = false; /** * 初始化 */ @Override public void start() { if(isRunning()) { return; } SortedMap<String,Cookie> cookies = prepareMap(); if (getCookiesLoadFile()!=null) { //从cookies配置文件加载cookies loadCookies(getCookiesLoadFile(), cookies); } isRunning = true; } @Override public boolean isRunning() { return isRunning; } @Override public void stop() { isRunning = false; } /** * 初始化SortedMap<String,Cookie> 由具体子类实现 * @return */ protected abstract SortedMap<String,Cookie> prepareMap(); /** * 从Reader reader对象加载cookies * @param reader * @param cookies */ public static void loadCookies(Reader reader, SortedMap<String, Cookie> cookies) { BufferedReader br = new BufferedReader(reader); try { String line; int lineNo = 1; while ((line = br.readLine()) != null) { if (!line.matches("\\s*(?:#.*)?")) { // skip blank links and comments String[] tokens = line.split("\\t"); if (tokens.length == 7) { long epochSeconds = Long.parseLong(tokens[4]); Date expirationDate = (epochSeconds >= 0 ? 
new Date(epochSeconds * 1000) : null); Cookie cookie = new Cookie(tokens[0], tokens[5], tokens[6], tokens[2], expirationDate, Boolean.valueOf(tokens[3]).booleanValue()); cookie.setDomainAttributeSpecified(Boolean.valueOf(tokens[1]).booleanValue()); LOGGER.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie.toExternalForm()); cookies.put(cookie.getSortKey(), cookie); } else { LOGGER.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens"); } } lineNo++; } } catch (IOException e) { LOGGER.log(Level.WARNING,e.getMessage(), e); } } /** * 从配置文件加载SortedMap<String, Cookie> cookies * @param file * @param cookies */ protected static void loadCookies(ConfigFile file, SortedMap<String, Cookie> cookies) { Reader reader = null; try { reader = file.obtainReader(); loadCookies(reader, cookies); } finally { IOUtils.closeQuietly(reader); } } public static void loadCookies(String cookiesFile, SortedMap<String,Cookie> result) { // Do nothing if cookiesFile is not specified. if (cookiesFile == null || cookiesFile.length() <= 0) { return; } FileReader reader = null; try { reader = new FileReader(cookiesFile); loadCookies(reader, result); } catch (FileNotFoundException e) { LOGGER.log(Level.WARNING,"Could not find file: " + cookiesFile, e); } finally { IOUtils.closeQuietly(reader); } } /** * 保存map容器中的cookies到文件 * @param saveCookiesFile * @param cookies */ public static void saveCookies(String saveCookiesFile, Map<String,Cookie> cookies) { // Do nothing if cookiesFile is not specified. 
if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { return; } FileOutputStream out = null; try { out = new FileOutputStream(new File(saveCookiesFile)); String tab ="\t"; out.write("# Heritrix Cookie File\n".getBytes()); out.write("# This file is the Netscape cookies.txt format\n\n".getBytes()); for (Cookie cookie: cookies.values()) { // Guess an initial size MutableString line = new MutableString(1024 * 2); line.append(cookie.getDomain()); line.append(tab); line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE"); line.append(tab); line.append(cookie.getPath()); line.append(tab); line.append(cookie.getSecure() ? "TRUE" : "FALSE"); line.append(tab); line.append(cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() / 1000 : -1); line.append(tab); line.append(cookie.getName()); line.append(tab); line.append(cookie.getValue() != null ? cookie.getValue() : ""); line.append("\n"); out.write(line.toString().getBytes()); } } catch (IOException e) { LOGGER.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e); } finally { IOUtils.closeQuietly(out); } } /** * 具体子类实现 */ @Override public abstract SortedMap<String,Cookie> getCookiesMap(); /** * 保存map容器中的cookies */ @Override public void saveCookiesMap(Map<String, Cookie> map) { //抽象方法由具体子类实现 innerSaveCookiesMap(map); if (getCookiesSaveFile()!=null) { saveCookies(getCookiesSaveFile().getFile().getAbsolutePath(), map); } } /** * 具体子类实现 * @param map */ protected abstract void innerSaveCookiesMap(Map<String,Cookie> map); @Override public void close() throws IOException { } }
Heritrix3.1.0提供了两个继承类,分别为BdbCookieStorage和SimpleCookieStorage,前者将cookies保存在BDB数据库,后者保存在Map对象里面
BdbCookieStorage类的相关方法如下
/** BDB module used to open the cookie database; injected by Spring. */
protected BdbModule bdb;

@Autowired
public void setBdbModule(BdbModule bdb) {
    this.bdb = bdb;
}

/** are we a checkpoint recovery? (in which case, reuse stored cookie data?) */
boolean isCheckpointRecovery = false;

/** Name of the BDB database that holds the cookies. */
public static String COOKIEDB_NAME = "http_cookies";

private transient Database cookieDb;
private transient StoredSortedMap<String,Cookie> cookies;

public BdbCookieStorage() {
}

/**
 * Opens the cookie database through the BDB module and wraps it in a
 * {@code StoredSortedMap} with String keys and serialized {@code Cookie}
 * values.
 *
 * @return the database-backed cookie map
 * @throws RuntimeException wrapping any {@code DatabaseException}
 */
protected SortedMap<String,Cookie> prepareMap() {
    try {
        StoredClassCatalog catalog = bdb.getClassCatalog();

        BdbModule.BdbConfig config = new BdbModule.BdbConfig();
        config.setTransactional(false);
        config.setAllowCreate(true);
        cookieDb = bdb.openDatabase(COOKIEDB_NAME, config, isCheckpointRecovery);

        StringBinding keyBinding = new StringBinding();
        SerialBinding<Cookie> valueBinding =
            new SerialBinding<Cookie>(catalog, Cookie.class);
        cookies = new StoredSortedMap<String,Cookie>(
                cookieDb, keyBinding, valueBinding, true);
    } catch (DatabaseException e) {
        throw new RuntimeException(e);
    }
    return cookies;
}

/**
 * Returns the map created by {@link #prepareMap()}; {@code null} if
 * {@code prepareMap()} has not run yet.
 */
public SortedMap<String, Cookie> getCookiesMap() {
    // assert cookies != null : "cookie map not set up";
    return cookies;
}

/** Intentionally empty: this class does no extra work on save. */
protected void innerSaveCookiesMap(Map<String, Cookie> map) {
}
SimpleCookieStorage类与之类似,不在这里贴出来了
这里需要注意的是,Heritrix3.1.0系统改写了HttpClient组件的Cookie类,逻辑与HttpClient组件的Cookie类类似
那么Heritrix3.1.0系统怎样将CookieStorage接口实现类获取的SortedMap<String, Cookie>容器中的Cookies添加到HttpClient组件的相关对象中呢?
Heritrix3.1.0系统还改写了HttpClient组件的HttpState类,添加了设置SortedMap cookiesMap对象的方法,相关方法如下
/** Cookies held by this state; defaults to a concurrent skip-list map. */
private SortedMap cookiesMap = new ConcurrentSkipListMap();

// START IA/HERITRIX ADDITIONS
/**
 * Returns the sorted map of {@link Cookie cookies} currently held by this
 * HTTP state.
 *
 * Any operations on this map should be synchronized with respect
 * to this HttpState instance.
 *
 * @return sorted map of {@link Cookie cookies}
 */
public SortedMap getCookiesMap() {
    return this.cookiesMap;
}

/**
 * Swaps the standard sorted map for an external implementation
 * (such as one backed by a persistent store, like BDB's StoredSortedMap).
 *
 * @param map alternate sorted map to use to store cookies
 */
public void setCookiesMap(SortedMap map) {
    cookiesMap = map;
}
// END IA/HERITRIX ADDITIONS
同时HttpMethodBase对象相关方法里面从HttpState state对象获取Cookies对象也做了相应的改写
/** * Generates <tt>Cookie</tt> request headers for those {@link Cookie cookie}s * that match the given host, port and path. * * @param state the {@link HttpState state} information associated with this method * @param conn the {@link HttpConnection connection} used to execute * this HTTP method * * @throws IOException if an I/O (transport) error occurs. Some transport exceptions * can be recovered from. * @throws HttpException if a protocol exception occurs. Usually protocol exceptions * cannot be recovered from. */ protected void addCookieRequestHeader(HttpState state, HttpConnection conn) throws IOException, HttpException { LOG.trace("enter HttpMethodBase.addCookieRequestHeader(HttpState, " + "HttpConnection)"); Header[] cookieheaders = getRequestHeaderGroup().getHeaders("Cookie"); for (int i = 0; i < cookieheaders.length; i++) { Header cookieheader = cookieheaders[i]; if (cookieheader.isAutogenerated()) { getRequestHeaderGroup().removeHeader(cookieheader); } } CookieSpec matcher = getCookieSpec(state); String host = this.params.getVirtualHost(); if (host == null) { host = conn.getHost(); } // BEGIN IA/HERITRIX CHANGES Cookie[] cookies = matcher.match(host, conn.getPort(), getPath(), conn.isSecure(), state.getCookiesMap()); // END IA/HERITRIX CHANGES if ((cookies != null) && (cookies.length > 0)) { if (getParams().isParameterTrue(HttpMethodParams.SINGLE_COOKIE_HEADER)) { // In strict mode put all cookies on the same header String s = matcher.formatCookies(cookies); getRequestHeaderGroup().addHeader(new Header("Cookie", s, true)); } else { // In non-strict mode put each cookie on a separate header for (int i = 0; i < cookies.length; i++) { String s = matcher.formatCookie(cookies[i]); getRequestHeaderGroup().addHeader(new Header("Cookie", s, true)); } } } }
最后,怎样在配置文件crawler-beans.cxml中配置cookie文件呢?本人做了一个示例
<!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
<bean id="cookieStorage" class="org.archive.modules.fetcher.BdbCookieStorage">
  <!-- File the cookies are loaded from at startup -->
  <property name="cookiesLoadFile"><ref bean="cookieInit"/></property>
  <!-- File the cookies are dumped to on save -->
  <property name="cookiesSaveFile"><ref bean="cookieSave"/></property>
  <!-- The setter on BdbCookieStorage is setBdbModule(...), so the Spring
       property name is "bdbModule". The setter is also @Autowired, so this
       explicit wiring is optional. -->
  <property name="bdbModule">
    <ref bean="bdb"/>
  </property>
</bean>

<!-- Source cookies file (7 tab-separated fields per line) -->
<bean id="cookieInit" class="org.archive.spring.ConfigFile">
  <property name="name" value="cookie.txt" />
  <property name="path" value="/root/stpl/cookie.txt" />
</bean>

<!-- Destination for the cookie dump -->
<bean id="cookieSave" class="org.archive.spring.ConfigPath">
  <property name="name" value="cookies_dump.txt" />
  <property name="path" value="/root/stpl/cookies_dump.txt" />
</bean>
cookie.txt文件格式可以参考下面这段英文注释:每行一个cookie,包含7个以制表符(Tab)分隔的字段
* format. Example entry of cookies.txt file: * <p> * www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond * </p> * <p> * Each line has 7 tab-separated fields: * </p> * <ol> * <li>DOMAIN: The domain that created and have access to the cookie value.</li> * <li>FLAG: A TRUE or FALSE value indicating if hosts within the given * domain can access the cookie value.</li> * <li>PATH: The path within the domain that the cookie value is valid for.</li> * <li>SECURE: A TRUE or FALSE value indicating if to use a secure * connection to access the cookie value.</li> * <li>EXPIRATION: The expiration time of the cookie value, or -1 for no * expiration</li> * <li>NAME: The name of the cookie value</li> * <li>VALUE: The cookie value</li> * </ol>
---------------------------------------------------------------------------
本系列Heritrix 3.1.0 源码解析系本人原创
转载请注明出处 博客园 刺猬的温驯
本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/28/3049673.html