HtmlUnit 是一款开源的 Java 页面分析工具:读取页面后,可以方便地使用 HtmlUnit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。由于这个浏览器没有界面,运行速度也非常快。
相关文件下载地址:
http://sourceforge.net/projects/htmlunit/files/
http://jaist.dl.sourceforge.net/project/htmlunit/htmlunit/2.15/htmlunit-2.15-bin.zip
我的需求是使用百度的高级新闻搜索,抓取指定站点新闻
手动搜索的设置如图所示(原文此处为截图)。对应的示例代码如下:
package
com.html580;
import
java.io.IOException;
import
java.net.MalformedURLException;
import
java.util.List;
import
com.gargoylesoftware.htmlunit.ElementNotFoundException;
import
com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import
com.gargoylesoftware.htmlunit.WebClient;
import
com.gargoylesoftware.htmlunit.html.HtmlForm;
import
com.gargoylesoftware.htmlunit.html.HtmlHiddenInput;
import
com.gargoylesoftware.htmlunit.html.HtmlPage;
import
com.gargoylesoftware.htmlunit.html.HtmlRadioButtonInput;
import
com.gargoylesoftware.htmlunit.html.HtmlSelect;
import
com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import
com.gargoylesoftware.htmlunit.html.HtmlTextInput;
/**
 * Fills in Baidu's advanced-search form through HtmlUnit, submits it, and
 * prints the resulting page as XML to standard output.
 *
 * @author html580
 * @site http://www.html580.com
 * @version 2014-7-23
 */
public class BaiduSpider {

    /**
     * Page that hosts the advanced-search form.
     * NOTE(review): the original snippet never loaded a page; this URL is
     * assumed from the form/field names used below (form "f1", hidden
     * "tn" = "baiduadv") -- confirm it is the intended entry point.
     */
    private static final String ADVANCED_SEARCH_URL = "http://www.baidu.com/gaoji/advanced.html";

    /**
     * Runs a single search and dumps the result page.
     *
     * <p>Failures (HTTP error status, malformed URL, missing form element,
     * I/O error) are reported via their stack trace; the web client is
     * closed in every case.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final WebClient webclient = new WebClient();
        try {
            // Load the page that contains the search form before touching it.
            final HtmlPage htmlpage = webclient.getPage(ADVANCED_SEARCH_URL);

            // Search form and its submit button.
            final HtmlForm form = htmlpage.getFormByName("f1");
            final HtmlSubmitInput button = form.getInputByValue("百度一下");

            // Search keywords.
            final HtmlTextInput textField = form.getInputByName("q1");
            textField.setValueAttribute("HTML我帮您");

            // Number of results per page.
            final HtmlSelect htmlSelet = form.getSelectByName("rn");
            htmlSelet.setDefaultValue("10");

            // Page time restriction. The original looked up "rn" a second time
            // here, which overwrote the results-per-page value and never set
            // the time filter; the field for it is "lm".
            final HtmlSelect htmlSeletlm = form.getSelectByName("lm");
            htmlSeletlm.setDefaultValue("0");

            // Language: select the first option only.
            final List<HtmlRadioButtonInput> radioButtonCts = form.getRadioButtonsByName("ct");
            radioButtonCts.get(0).setChecked(true);
            radioButtonCts.get(1).setChecked(false);
            radioButtonCts.get(2).setChecked(false);

            // Document format: empty value.
            final HtmlSelect htmlSeletft = form.getSelectByName("ft");
            htmlSeletft.setDefaultValue("");

            // Keyword position: select the first option only.
            final List<HtmlRadioButtonInput> radioButtonq5s = form.getRadioButtonsByName("q5");
            radioButtonq5s.get(0).setChecked(true);
            radioButtonq5s.get(1).setChecked(false);
            radioButtonq5s.get(2).setChecked(false);

            // Site search: restrict results to the given site.
            final HtmlTextInput htmlTextInputq6 = form.getInputByName("q6");
            htmlTextInputq6.setDefaultValue("html580.com");

            // Hidden field.
            final HtmlHiddenInput hiddenInputtn = form.getInputByName("tn");
            hiddenInputtn.setDefaultValue("baiduadv");

            // Submit the form and print the result page.
            final HtmlPage page2 = button.click();
            String result = page2.asXml();
            System.out.println(result);
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (ElementNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the client's windows even when the search fails part-way.
            webclient.closeAllWindows();
        }
    }
}
|
转自:http://www.html580.com/11591/baiduspider