Crawlers are something we love and hate in equal measure. I remember back in college, after a friend of mine learned Python crawling, he spent whole days happily scraping every site he could find. In engineering work, crawler-related tasks come up all the time. Crawlers usually go hand in hand with multithreading; today I'll walk through a simple version in detail, and add a thread-pool version in a follow-up post.
What we'll implement: read a list of URLs from a file, store each page's result in its own file, and strip the HTML tags and JavaScript code from the content.
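For reference, the input file is simply one URL per line. A minimal in.txt might look like this (the URLs are just placeholders):

https://example.com/
https://example.org/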
First, create a Maven project and add the dependency:
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.32</version>
</dependency>
This library is built specifically for fetching web pages.
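Before wiring everything together, here is a minimal sketch of HtmlUnit on its own, just to show the basic fetch flow (the URL is only an illustration):

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitDemo {
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the simulated browser when we are done
        try (WebClient client = new WebClient(BrowserVersion.CHROME)) {
            client.getOptions().setJavaScriptEnabled(false); // we only need the static HTML
            HtmlPage page = client.getPage("https://example.com/"); // placeholder URL
            System.out.println(page.getTitleText());
        }
    }
}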
// Entity class that wraps the fetch of a single web page
import java.io.IOException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class WebEntity {
    private WebClient webclient; // simulated browser object
    private String url;          // request URL

    public WebEntity(String url) {
        WebClient webclient = new WebClient(BrowserVersion.CHROME);
        webclient.getOptions().setJavaScriptEnabled(false); // skip JS execution; we strip scripts later anyway
        webclient.getOptions().setCssEnabled(false);        // skip CSS processing
        webclient.getOptions().setUseInsecureSSL(false);
        webclient.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0");
        this.webclient = webclient;
        this.url = url;
    }

    // Fetch the page, retrying up to 5 times before giving up
    public HtmlPage executeReq() {
        int time = 1;
        while (time <= 5) {
            try {
                return webclient.getPage(url);
            } catch (IOException e) {
                // swallow and retry; after the 5th failure we return null
            }
            time++;
        }
        return null;
    }

    // readable form for logging
    @Override
    public String toString() {
        return String.format("url = %s", url);
    }

    // getters and setters
    public WebClient getWebclient() {
        return webclient;
    }

    public void setWebclient(WebClient webclient) {
        this.webclient = webclient;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
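One possible refinement, purely a sketch and not part of the code above: sleep between retries so a flaky server isn't hit five times back to back. The method below is a hypothetical variant you could add to WebEntity:

    // Hypothetical variant of executeReq() with a simple linear backoff
    public HtmlPage executeReqWithBackoff() {
        for (int attempt = 1; attempt <= 5; attempt++) {
            try {
                return webclient.getPage(url);
            } catch (IOException e) {
                try {
                    Thread.sleep(1000L * attempt); // wait longer after each failure
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt(); // restore the interrupt flag
                    return null;
                }
            }
        }
        return null;
    }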
// Singleton that drives the page crawl
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class Craw {
    private static volatile Craw singleton;

    private Craw() {
    }

    // Double-checked locking: the volatile field plus two null checks
    // keep initialization both lazy and thread-safe
    public static Craw getInstance() {
        if (singleton == null) {
            synchronized (Craw.class) {
                if (singleton == null) {
                    singleton = new Craw();
                }
            }
        }
        return singleton;
    }

    public HtmlPage parsePage(WebEntity webEntity) {
        HtmlPage page = null;
        try {
            page = webEntity.executeReq();
        } catch (Exception e) {
            System.err.println("Failed to fetch page: " + webEntity);
        }
        return page;
    }
}
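As an aside, the same lazy, thread-safe initialization can be achieved without explicit locking via the initialization-on-demand holder idiom; a minimal sketch (an alternative, not the code this post uses):

public class CrawHolder {
    private CrawHolder() {
    }

    // The JVM loads Holder (and creates INSTANCE) only on the first
    // getInstance() call, and class loading is inherently thread-safe
    private static class Holder {
        static final CrawHolder INSTANCE = new CrawHolder();
    }

    public static CrawHolder getInstance() {
        return Holder.INSTANCE;
    }
}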
// Main class: reads the URL list from a file and writes each cleaned page out
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class Pachong {
    // Write the cleaned text of one page into the given file
    static void outfile(File file, HtmlPage page) {
        if (file == null) {
            return;
        }
        // try-with-resources closes both streams even if writing fails
        try (FileOutputStream fileOutputStream = new FileOutputStream(file);
             BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream)) {
            if (page != null) {
                String ans = page.asXml();
                String anstrue = LabelUtil.handleHtmlLabel(ans);
                bufferedOutputStream.write(anstrue.getBytes());
            }
            bufferedOutputStream.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        String filename = "d://test/in.txt";
        File file = new File(filename);
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            String line;
            int cnt = 0;
            Craw craw = Craw.getInstance();
            while ((line = bufferedReader.readLine()) != null) {
                WebEntity webEntity = new WebEntity(line);
                HtmlPage page = craw.parsePage(webEntity);
                if (page != null) {
                    // FileOutputStream creates the file if it does not exist,
                    // so no exists()/createNewFile() check is needed
                    File file1 = new File("d://test/" + cnt + ".txt");
                    outfile(file1, page);
                }
                cnt++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
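As a rough preview of the thread-pool version promised at the top (only a sketch; the real write-up comes later), each URL would be submitted to an ExecutorService instead of being fetched inside the loop:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PachongPoolSketch {
    public static void crawlAll(java.util.List<String> urls) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4); // 4 workers, an arbitrary choice
        int cnt = 0;
        for (String url : urls) {
            final int id = cnt++;
            pool.submit(() -> {
                // each task builds its own WebEntity, since WebClient is not thread-safe
                WebEntity webEntity = new WebEntity(url);
                Pachong.outfile(new java.io.File("d://test/" + id + ".txt"),
                        Craw.getInstance().parsePage(webEntity));
            });
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS); // wait for all fetches to finish
    }
}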
// Strips HTML tags and JavaScript code from the page text
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LabelUtil {
    public static String handleHtmlLabel(String html) {
        String noHTMLString = "";
        html = html.replaceAll("&amp;", "&");
        // First decode decimal character references such as &#38;
        Matcher m = Pattern
                .compile("&#(\\d+);", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL | Pattern.CANON_EQ)
                .matcher(html);
        boolean b = false;
        int i = 0;
        while (m.find()) {
            if (i > 500) {
                System.out.println(i);
            }
            i++;
            html = html.replace("&#" + m.group(1) + ";", (char) Integer.parseInt(m.group(1)) + "");
            b = true;
        }
        // If none were found, try hexadecimal references such as &#x26;
        if (!b) {
            m = Pattern
                    .compile("&#[xX]([\\da-f]+);",
                            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL | Pattern.CANON_EQ)
                    .matcher(html);
            int j = 0;
            while (m.find()) {
                if (j > 500) {
                    System.out.println(j);
                }
                j++;
                html = html.replaceAll("&#[xX]" + m.group(1) + ";", (char) Integer.parseInt(m.group(1), 16) + "");
            }
        }
        // Strip <script> blocks: cut from each "<script" to the matching
        // "</script>" (9 characters long, hence the +9 offset)
        String scl = "<script";
        String scr = "</script>";
        int indexl = html.indexOf(scl);
        while (indexl != -1) {
            int indexr = html.indexOf(scr, indexl);
            if (indexr != -1) {
                // keep the text before the opening tag and after the closing tag
                html = html.substring(0, indexl) + html.substring(indexr + 9);
            } else {
                // unclosed script tag: drop everything from it to the end
                html = html.substring(0, indexl);
            }
            indexl = html.indexOf(scl);
        }
        // Remove common block tags, leftover entities, then any remaining markup
        noHTMLString = html.replaceAll("(?i)<\\s*(?:br|div|p|td)\\s*(?:[^>])*\\s*>", "")
                .replaceAll("&nbsp;", " ")
                .replaceAll("<.*?>", "")
                .replaceAll("&(?:g|l)t;?", "");
        // Finally collapse all remaining whitespace
        Pattern pattern = Pattern.compile("\\s+");
        Matcher matcher = pattern.matcher(noHTMLString);
        return matcher.replaceAll("").trim();
    }
}
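Worth noting: HtmlUnit can do most of this stripping on its own. A minimal alternative sketch, assuming HtmlUnit 2.32, where HtmlPage inherits asText() (it returns the rendered text with all markup removed):

import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class TextExtractDemo {
    // Alternative to the regex-based LabelUtil: let HtmlUnit render the
    // page to plain text. Scripts are already disabled in WebEntity, and
    // asText() omits tags, so no manual stripping is needed.
    static String extractText(HtmlPage page) {
        return page.asText();
    }
}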