1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
// Example: issue a GET against http://localhost/ and print the response body
// only when the entity reports a known length below 2048.
CloseableHttpClient httpclient = HttpClients.createDefault();
try {
    HttpGet httpget = new HttpGet("http://localhost/");
    CloseableHttpResponse response = httpclient.execute(httpget);
    try {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            long len = entity.getContentLength();
            // A length of -1 is excluded here; HttpEntity reports a negative
            // length when the size is not known in advance.
            if (len != -1 && len < 2048) {
                System.out.println(EntityUtils.toString(entity));
            } else {
                // Stream content out
            }
        }
    } finally {
        // Always release the response, even if reading the entity failed.
        response.close();
    }
} finally {
    // The client itself is closeable and was never closed before.
    httpclient.close();
}
|
1
2
3
4
5
6
7
8
9
10
|
……
/**
 * Compile-time settings for the crawler: URLs, on-disk locations and the
 * worker thread count.
 */
public class CrawlConfig {

    /** URL constant; presumably the crawl entry point (confirm against the caller). */
    public static final String CRAWL_PATH = "http://www.163.com";

    /** URL constant; presumably restricts which links are followed (confirm against the caller). */
    public static final String CRAWL_LIMIT_PATH = "http://www.163.com";

    /** Directory constant; by its name, the home of the "visited" frontier store. */
    public static final String CRAWL_VISITED_FRONTIER = "d:\\cache\\hevisited";

    /** Directory constant; by its name, the home of the "unvisited" frontier store. */
    public static final String CRAWL_UNVISITED_FRONTIER = "d:\\cache\\heunvisited";

    /** Path prefix under which downloaded pages are written. */
    public static final String CRAWL_DOWNLOAD_PATH = "d:\\download\\163\\";

    /** Number of crawler threads; also seeds {@code BDBFrontier.threads}. */
    public static final int CRAWL_THREAD_NUM = 6;
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
……
/**
 * Serializable value object describing one URL handled by the crawler.
 */
public class CrawlUrl implements Serializable {

    private static final long serialVersionUID = 79332323432323L;

    /** The original URL. */
    private String oriUrl;

    /** The URL address. */
    private String url;

    /** Creates an instance with both URLs unset ({@code null}). */
    public CrawlUrl() {
    }

    /** @return the original URL, or {@code null} if it was never set */
    public String getOriUrl() {
        return this.oriUrl;
    }

    /** @param oriUrl the original URL to store */
    public void setOriUrl(String oriUrl) {
        this.oriUrl = oriUrl;
    }

    /** @return the URL address, or {@code null} if it was never set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL address to store */
    public void setUrl(String url) {
        this.url = url;
    }
}
|
1
2
3
|
/**
 * Predicate over URL strings.
 */
public interface LinkFilter {

    /**
     * Decides whether the given URL passes this filter.
     *
     * @param url the URL to test
     * @return {@code true} if the URL is accepted
     */
    boolean accept(String url);
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
……
/**
 * Base class for frontiers stored in a Berkeley DB environment.
 *
 * <p>The constructor opens the environment in the given home directory, a
 * class-catalog database and a database named "URL". Subclasses supply the
 * actual put/get/delete operations.
 *
 * <p>NOTE(review): {@link #database} is declared {@code static}, so every
 * instance overwrites the same field when constructed — confirm this sharing
 * is intended before creating more than one frontier per JVM.
 */
public abstract class AbstractFrontier {

    private Environment env;

    /** Name of the database that backs the serial class catalog. */
    private static final String CLASS_CATALOG = "java_class_catalog";

    protected StoredClassCatalog javaCatalog;
    protected Database catalogdatabase;
    protected static Database database = null;
    protected String homeDirectory = null;

    /**
     * Opens the environment and both databases, creating them if absent.
     *
     * @param homeDirectory directory that holds the environment files
     * @throws DatabaseException if the environment or a database cannot be opened
     * @throws FileNotFoundException declared for callers; kept for compatibility
     */
    public AbstractFrontier(String homeDirectory)
            throws DatabaseException, FileNotFoundException {
        this.homeDirectory = homeDirectory;
        System.out.println("open environment: " + homeDirectory);

        // Configure and open the environment.
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setTransactional(true);
        envConfig.setAllowCreate(true);
        env = new Environment(new File(homeDirectory), envConfig);

        // One configuration is used for both databases; the former second
        // config object was built identically but never passed anywhere.
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setTransactional(true);
        dbConfig.setAllowCreate(true);

        // Open the class catalog.
        catalogdatabase = env.openDatabase(null, CLASS_CATALOG, dbConfig);
        javaCatalog = new StoredClassCatalog(catalogdatabase);

        // Open the URL database.
        database = env.openDatabase(null, "URL", dbConfig);
    }

    /**
     * Closes the URL database, the class catalog and the environment, in that
     * order. Each later step is still attempted when an earlier one throws.
     *
     * @throws DatabaseException if closing fails
     */
    public void close() throws DatabaseException {
        try {
            database.close();
        } finally {
            try {
                javaCatalog.close();
            } finally {
                env.close();
            }
        }
    }

    /** Stores {@code value} under {@code key}. */
    protected abstract void put(Object key, Object value);

    /** Returns the value stored under {@code key}. */
    protected abstract Object get(Object key);

    /** Removes the entry stored under {@code key} and returns a result object. */
    protected abstract Object delete(Object key);
}
|
1
2
3
4
5
6
|
……
/**
 * Source and sink of {@link CrawlUrl} items for the crawler.
 */
public interface Frontier {

    /**
     * Returns the next URL to process.
     *
     * @return the next {@link CrawlUrl}
     * @throws Exception if the next URL cannot be obtained
     */
    CrawlUrl getNext() throws Exception;

    /**
     * Offers a URL to the frontier.
     *
     * @param url the URL to add
     * @return a boolean result; in {@code BDBFrontier} it is {@code true} only when the URL was stored
     * @throws Exception if the URL cannot be added
     */
    boolean putUrl(CrawlUrl url) throws Exception;
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
|
……
/**
 * {@link Frontier} implementation that keeps pending URLs in a Berkeley DB
 * {@code StoredMap}, keyed by {@link CrawlUrl#getOriUrl()}.
 *
 * <p>{@link #getNext()} and {@link #putUrl(CrawlUrl)} are synchronized on this
 * instance and coordinate through {@code wait()}/{@code notifyAll()}.
 */
public class BDBFrontier extends AbstractFrontier implements Frontier {

    /** Map view over the inherited database: original URL to CrawlUrl. */
    private StoredMap pendingUrisDB = null;

    /**
     * Count of consumer threads not currently parked in {@link #getNext()};
     * starts at the configured thread number.
     */
    public static int threads = CrawlConfig.CRAWL_THREAD_NUM;

    /**
     * Creates a new instance of BDBFrontier.
     *
     * @param homeDirectory directory of the database environment
     * @throws DatabaseException if the environment or databases cannot be opened
     * @throws FileNotFoundException declared by the superclass constructor
     */
    public BDBFrontier(String homeDirectory)
            throws DatabaseException, FileNotFoundException {
        super(homeDirectory);
        EntryBinding keyBinding = new SerialBinding(javaCatalog, String.class);
        EntryBinding valueBinding = new SerialBinding(javaCatalog, CrawlUrl.class);
        pendingUrisDB = new StoredMap(database, keyBinding, valueBinding, true);
    }

    /**
     * Removes every entry from the map if it is not already empty.
     */
    public void clearAll() {
        if (!pendingUrisDB.isEmpty()) {
            pendingUrisDB.clear();
        }
    }

    /**
     * Removes and returns the first entry yielded by the map's entry iterator.
     *
     * <p>When the map is empty the caller decrements {@link #threads}. If other
     * threads are still counted as active it waits for a notification and
     * retries; otherwise it wakes all waiters and returns {@code null}.
     *
     * @return the next URL, or {@code null} once no active thread remains
     * @throws Exception if waiting is interrupted or the store fails
     * @see com.fc.frontier.Frontier#getNext()
     */
    @Override
    public synchronized CrawlUrl getNext() throws Exception {
        while (true) {
            if (!pendingUrisDB.isEmpty()) {
                Set<?> entries = pendingUrisDB.entrySet();
                @SuppressWarnings("unchecked") // the map only ever holds String -> CrawlUrl
                Entry<String, CrawlUrl> entry =
                        (Entry<String, CrawlUrl>) pendingUrisDB.entrySet().iterator().next();
                CrawlUrl result = entry.getValue();
                // Remove the record that is being handed out.
                delete(entry.getKey());
                System.out.println("get:" + homeDirectory + entries);
                return result;
            }
            // Nothing pending: this thread stops counting as active.
            threads--;
            if (threads > 0) {
                wait();
                threads++;
            } else {
                // Last active thread: release the waiters so they can finish too.
                notifyAll();
                return null;
            }
        }
    }

    /**
     * Stores a URL keyed by its original URL and wakes waiting consumers.
     *
     * @param url the URL to store; {@code null} is rejected
     * @return {@code true} if stored; {@code false} if {@code url} is
     *         {@code null}, its original URL is {@code null} or empty, or the
     *         key is already present
     * @throws Exception declared by the interface
     * @see com.fc.frontier.Frontier#putUrl(com.fc.CrawlUrl)
     */
    @Override
    public synchronized boolean putUrl(CrawlUrl url) throws Exception {
        if (url == null) {
            return false;
        }
        String key = url.getOriUrl();
        if (key == null || key.equals("") || pendingUrisDB.containsKey(key)) {
            return false;
        }
        Set<?> entries = pendingUrisDB.entrySet();
        put(key, url);
        notifyAll();
        System.out.println("put:" + homeDirectory + entries);
        return true;
    }

    /**
     * Reports whether the map holds an entry for the given key.
     *
     * @param key the key to look up
     * @return {@code true} if the key is present
     */
    public boolean contains(Object key) {
        return pendingUrisDB.containsKey(key);
    }

    /**
     * Writes an entry to the database.
     *
     * @see com.fc.frontier.AbstractFrontier#put(java.lang.Object, java.lang.Object)
     */
    @Override
    protected synchronized void put(Object key, Object value) {
        pendingUrisDB.put(key, value);
    }

    /**
     * Reads an entry from the database.
     *
     * @see com.fc.frontier.AbstractFrontier#get(java.lang.Object)
     */
    @Override
    protected synchronized Object get(Object key) {
        return pendingUrisDB.get(key);
    }

    /**
     * Deletes an entry and returns whatever the map's {@code remove} returns.
     *
     * @see com.fc.frontier.AbstractFrontier#delete(java.lang.Object)
     */
    @Override
    protected synchronized Object delete(Object key) {
        return pendingUrisDB.remove(key);
    }

    /**
     * Transforms a URL before it is used; currently returns it unchanged.
     * A compression algorithm could be applied here.
     *
     * @param url the URL to transform
     * @return the same URL
     */
    private String calculateUrl(String url) {
        return url;
    }

    /**
     * Smoke test: stores one URL, reads it back and prints its original URL.
     * The frontier is closed even when an operation fails.
     */
    public static void main(String[] strs) {
        BDBFrontier bdbFrontier = null;
        try {
            bdbFrontier = new BDBFrontier("d:\\cache");
            CrawlUrl url = new CrawlUrl();
            url.setOriUrl("http://www.163.com");
            bdbFrontier.putUrl(url);
            System.out.println(bdbFrontier.getNext().getOriUrl());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bdbFrontier != null) {
                try {
                    bdbFrontier.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
|
……
public
class
RetrievePage {
private
static
String USER_AGENT = "Mozilla/
4.0
(compatible; MSIE
6.0
;
Windows NT
5.1
; SV1; QQDownload
1.7
; .NET CLR
1.1
.
4322
; CIBA; .NET CLR
2.0
.
50727
";
private
static
String DEFAULT_CHARSET =
"GB2312,utf-8;q=0.7,*;q=0.7"
;
private
static
String ACCEPT =
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
;
/**
* 下载文件
* @param path
* @return
* @throws Exception
* @throws IOException
*/
public
static
boolean
downloadPage(String path)
throws
Exception,IOException
{
CloseableHttpClient httpclient = HttpClients.createDefault();
HttpGet httpget =
new
HttpGet(path);
httpget.addHeader(
"Accept-Charset"
, DEFAULT_CHARSET);
// httpget.addHeader("Host", host);
httpget.addHeader(
"Accept"
, ACCEPT);
httpget.addHeader(
"User-Agent"
, USER_AGENT);
RequestConfig requestConfig = RequestConfig.custom()
//设置超时
.setSocketTimeout(
1000
)
.setConnectTimeout(
1000
)
.build();
httpget.setConfig(requestConfig);
CloseableHttpResponse response = httpclient.execute(httpget);
try
{
HttpEntity entity = response.getEntity();
StatusLine statusLine = response.getStatusLine();
if
(statusLine.getStatusCode() == HttpStatus.SC_MOVED_PERMANENTLY ||
//如果是转移
statusLine.getStatusCode() == HttpStatus.SC_MOVED_TEMPORARILY ||
statusLine.getStatusCode() == HttpStatus.SC_SEE_OTHER ||
statusLine.getStatusCode() == HttpStatus.SC_TEMPORARY_REDIRECT)
{
Header header = httpget.getFirstHeader(
"location"
);
if
(header !=
null
){
String newUrl = header.getValue();
if
(newUrl ==
null
|| newUrl.equals(
""
))
{
newUrl =
"/"
;
HttpGet redirect =
new
HttpGet(newUrl);
}
}
}
if
(statusLine.getStatusCode() == HttpStatus.SC_OK) {
//成功访问
if
(entity ==
null
) {
throw
new
ClientProtocolException(
"Response contains no content"
);
}
else
{
InputStream instream = entity.getContent();
String filename = getFilenameByUrl(path,entity.getContentType().getValue());
OutputStream outstream =
new
FileOutputStream(CrawlConfig.CRAWL_DOWNLOAD_PATH +
filename);
//存储到磁盘
try
{
//System.out.println(convertStreamToString(instream));
int
tempByte = -
1
;
while
((tempByte = instream.read())>
0
)
{
&
|