// 创建httpclient实例
CloseableHttpClient httpclient = HttpClients.createDefault();
String url = "https://\"www.hah.com\"";
// 创建httpget实例
HttpGet httpget = new HttpGet(url);
// 模拟浏览器 ✔
httpget.setHeader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/60.0");
// 使用代理 IP ✔
// HttpHost proxy = new HttpHost("192.168.1.124", 8080);
RequestConfig config = RequestConfig.custom()
//设置连接超时 ✔
.setConnectTimeout(10000) // 设置连接超时时间 10秒钟
.setSocketTimeout(10000) // 设置读取超时时间10秒钟
.build();
httpget.setConfig(config);
// 执行get请求
CloseableHttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();
// 获取返回实体(页面代码)
String content = EntityUtils.toString(entity, "utf-8");
//System.out.println(content);
// NOTE(review): `text` is not declared anywhere in this listing; presumably it is the
// parsed HTML document built from `content` (jsoup-style API) - confirm before use.
// Select the elements whose class attribute equals "ico bz-border".
Elements s3 = text.getElementsByAttributeValue("class", "ico bz-border");
// Select every <a> element.
Elements a = text.getElementsByTag("a");
// Read the href attribute of the first <a> element.
String href = a.get(0).attr("href");
// Read the text content of the second <a> element.
String ka = a.get(1).text();
(2)、用正则表达式解析,获取 HTML 标签、属性值等,可以分层逐级解析;如果正则表达式无法解析,可以利用 String API 中的方法截取字符串。例:
// NOTE(review): `text` is not declared anywhere in this listing; presumably it is the
// parsed HTML document built from `content` (jsoup-style API) - confirm before use.
// Select the elements whose class attribute equals "ico bz-border".
Elements s3 = text.getElementsByAttributeValue("class", "ico bz-border");
// Select every <a> element.
Elements a = text.getElementsByTag("a");
// Read the href attribute of the first <a> element.
String href = a.get(0).attr("href");
// Read the text content of the second <a> element.
String ka = a.get(1).text();
// Parse the page source with regular expressions.
// Matches one text-only anchor element: the opening <a ...> tag, its text, and </a>.
// The literal had lost its "<a[^>" prefix and "</a>" suffix, which look like tags and
// were stripped from the listing.
String aTag = "<a[^>]*>([^<]*)</a>";
Pattern aPattern = Pattern.compile(aTag);
Matcher aMatcher = aPattern.matcher(content);
// Matches every character outside U+4E00..U+9FA5, i.e. everything that is not a common
// Chinese character. Compiled once here instead of once per loop iteration.
Pattern nonChinese = Pattern.compile("[^\\u4E00-\\u9FA5]");
// "true" once an anchor whose Chinese text is exactly "搜职位" has been seen; the
// tutorial uses this to decide whether the requested address is valid.
String judge = "false";
while (aMatcher.find()) {
    String allMessage = aMatcher.group();
    // Keep only the Chinese characters of the matched anchor.
    String doubleAddress = nonChinese.matcher(allMessage).replaceAll("");
    if (doubleAddress.equals("搜职位")) {
        judge = "true";
    }
}
最后,把解析得到的数据存储到数据库,或导出为文件即可。
//导出文件
public class ExportExcel {
HSSFWorkbook workbook = new HSSFWorkbook();// 创建工作簿对象 中有多个sheet
//显示的导出表的标题
private String title;
//导出表的列名
private String[] rowName;
private List> list = new ArrayList>();
private List
package com.mbyte.easy.admin.controller;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mbyte.easy.admin.entity.FiveEightCity;
import com.mbyte.easy.admin.service.IFiveEightCityService;
import com.sun.org.apache.xerces.internal.parsers.DOMParser;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import javax.xml.parsers.ParserConfigurationException;
import static java.lang.Integer.parseInt;
/**
 * Console demo that searches a job-listing site for a keyword in a given city and prints
 * the link and address text of each matching entry.
 *
 * <p>The flow is: read keyword and city from standard input, convert the city name to
 * pinyin to form the site's sub-domain, download the first result page to validate the
 * city and read the total page count, then download and print every result page.
 */
public class TestHttp {

    /** Matches a text-only anchor element: opening {@code <a ...>} tag, text, closing tag. */
    private static final Pattern ANCHOR_TAG = Pattern.compile("<a[^>]*>([^<]*)</a>");

    /** Matches a text-only {@code <i>} element: opening tag, text, closing tag. */
    private static final Pattern ITALIC_TAG = Pattern.compile("<i[^>]*>([^<]*)</i>");

    /** Matches every character outside the range U+4E00..U+9FA5 (common Chinese characters). */
    private static final Pattern NON_CHINESE = Pattern.compile("[^\\u4E00-\\u9FA5]");

    /** Matches a single ASCII digit. */
    private static final Pattern DIGIT = Pattern.compile("[0-9]");

    /** Browser User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/60.0";

    /** Connect and read timeout, in milliseconds (10 seconds). */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Pause before each result-page request, in milliseconds (5 seconds). */
    private static final long CRAWL_DELAY_MILLIS = 5000L;

    /** Sub-domain used by the two-argument {@link #branchPage(int, String)}. */
    private static final String DEFAULT_CITY_SUBDOMAIN = "bd";

    /**
     * Reads a keyword and a city from standard input, validates the city, determines the
     * number of result pages and crawls each of them.
     *
     * @param args unused
     * @throws IOException if a page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        // A single Scanner serves both prompts: a second Scanner on System.in can miss
        // input that the first one has already buffered (e.g. when input is piped).
        Scanner in = new Scanner(System.in);
        System.out.print("请输入需要查找的公司关键字:");
        String keyword = in.next();
        System.out.print("请输入需要查找的公司所在城市:");
        String city = in.next();

        String cityPI = getPinYin(city);
        System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + cityPI);

        // NOTE(review): the host below is masked in this listing; branchPage targets 58.com
        // with the same path and query - confirm the intended host before running.
        String url = "https://" + cityPI + ".******.com/job/?key=" + encodeQueryValue(keyword)
                + "&classpolicy=main_null,job_A&final=1&jump=1";
        String content = fetchPage(url);

        // Without a "搜职位" link the response is treated as not being a search page,
        // which the program reports as an unknown city.
        if (!containsSearchLink(content)) {
            System.out.println("输入的\"" + city + "\"地址不存在!");
            return;
        }

        int page = parseTotalPages(content);
        if (page == -1) {
            System.out.println("没有与\"" + keyword + "\"关键字匹配的信息!");
            return;
        }

        TestHttp crawler = new TestHttp();
        for (int i = 1; i <= page; i++) {
            // Crawl the city that was just validated, not a fixed sub-domain.
            crawler.branchPage(i, keyword, cityPI);
        }
    }

    /**
     * Concatenates every match of {@code p} found in {@code str}.
     *
     * <p>Each match is appended exactly once, regardless of how many capture groups the
     * pattern declares.
     *
     * @param p   pattern to search for
     * @param str text to search in
     * @return all matches joined in order of appearance; empty if there is no match
     */
    public static String matchResult(Pattern p, String str) {
        StringBuilder sb = new StringBuilder();
        Matcher m = p.matcher(str);
        while (m.find()) {
            sb.append(m.group());
        }
        return sb.toString();
    }

    /**
     * Downloads one result page from the default ({@code bd}) sub-domain and prints the
     * link and address of each entry found on it.
     *
     * @param page    1-based page number
     * @param keyword search keyword
     * @throws IOException if the page cannot be downloaded or the wait is interrupted
     */
    public void branchPage(int page, String keyword) throws IOException {
        branchPage(page, keyword, DEFAULT_CITY_SUBDOMAIN);
    }

    /**
     * Downloads one result page for the given city sub-domain and prints the link and
     * address of each entry found on it.
     *
     * @param page       1-based page number
     * @param keyword    search keyword
     * @param cityPinyin city sub-domain, i.e. the city name in pinyin
     * @throws IOException if the page cannot be downloaded or the wait is interrupted
     */
    public void branchPage(int page, String keyword, String cityPinyin) throws IOException {
        String url = "https://" + cityPinyin + ".58.com/job/?key=" + encodeQueryValue(keyword)
                + "&classpolicy=main_null,job_A&final=1&jump=1&page=" + page;
        pauseBetweenRequests();
        String content = fetchPage(url);
        System.out.println("==================================================================================================");
        Matcher anchors = ANCHOR_TAG.matcher(content);
        while (anchors.find()) {
            printJobEntry(anchors.group());
        }
    }

    /**
     * Converts the Chinese characters of a string to lower-case, tone-less pinyin; every
     * other character is copied unchanged.
     *
     * @param inputString text to convert; leading and trailing whitespace is trimmed
     * @return the converted text
     */
    public static String getPinYin(String inputString) {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);

        StringBuilder output = new StringBuilder();
        try {
            for (char c : inputString.trim().toCharArray()) {
                String[] readings = isChinese(c)
                        ? PinyinHelper.toHanyuPinyinStringArray(c, format)
                        : null;
                // A character without a known reading is kept as-is instead of failing
                // with a NullPointerException.
                if (readings != null && readings.length > 0) {
                    output.append(readings[0]);
                } else {
                    output.append(c);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // The format above is fixed, so this indicates a programming error.
            throw new IllegalStateException("Unsupported pinyin output format", e);
        }
        return output.toString();
    }

    /** Returns whether {@code c} lies in the range U+4E00..U+9FA5. */
    private static boolean isChinese(char c) {
        return c >= '\u4E00' && c <= '\u9FA5';
    }

    /** Percent-encodes a value as UTF-8 so it can be placed in a URL query string. */
    private static String encodeQueryValue(String value) throws IOException {
        return URLEncoder.encode(value, "UTF-8");
    }

    /** Downloads {@code url} with a GET request and returns the body decoded as UTF-8. */
    private static String fetchPage(String url) throws IOException {
        HttpGet request = new HttpGet(url);
        request.setHeader("User-Agent", USER_AGENT);
        // Optional proxy, left disabled:
        // HttpHost proxy = new HttpHost("192.168.1.124", 8080);
        request.setConfig(RequestConfig.custom()
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build());
        // try-with-resources releases the connection even when the request fails.
        try (CloseableHttpClient client = HttpClients.createDefault();
                CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            return entity == null ? "" : EntityUtils.toString(entity, "utf-8");
        }
    }

    /** Sleeps for the crawl delay; an interrupt restores the flag and aborts the crawl. */
    private static void pauseBetweenRequests() throws InterruptedIOException {
        try {
            Thread.sleep(CRAWL_DELAY_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            InterruptedIOException aborted =
                    new InterruptedIOException("Interrupted while waiting between page requests");
            aborted.initCause(e);
            throw aborted;
        }
    }

    /** Returns whether some anchor in {@code html} has exactly "搜职位" as its Chinese text. */
    private static boolean containsSearchLink(String html) {
        Matcher anchors = ANCHOR_TAG.matcher(html);
        while (anchors.find()) {
            String chinese = NON_CHINESE.matcher(anchors.group()).replaceAll("");
            if (chinese.equals("搜职位")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the total page count from {@code html}; returns -1 when none is found.
     * When several elements qualify, the last one wins.
     */
    private static int parseTotalPages(String html) {
        int pages = -1;
        Matcher candidates = ITALIC_TAG.matcher(html);
        while (candidates.find()) {
            String element = candidates.group();
            // NOTE(review): the 27-28 character window presumably targets one specific
            // element of the site's markup holding a 1-2 digit count - confirm against
            // the live page.
            if (element.length() < 27 || element.length() > 28) {
                continue;
            }
            if (NON_CHINESE.matcher(element).replaceAll("").length() != 0) {
                continue;
            }
            String digits = matchResult(DIGIT, element);
            if (digits.isEmpty()) {
                // No number inside this element; parsing "" would throw.
                continue;
            }
            try {
                pages = Integer.parseInt(digits);
            } catch (NumberFormatException ignored) {
                // Digit run too long for an int: not a page count, keep the previous value.
            }
        }
        return pages;
    }

    /**
     * Prints the link and address of one anchor element if it looks like a job entry.
     * The length and offset constants are heuristics tied to the site's markup.
     */
    private static void printJobEntry(String anchorHtml) {
        if (anchorHtml.length() < 235 || anchorHtml.length() > 262) {
            return;
        }
        String chinese = NON_CHINESE.matcher(anchorHtml).replaceAll("");
        if (chinese.length() <= 8) {
            return;
        }
        String link = extractLink(anchorHtml.substring(10, 43));
        if (link == null) {
            return;
        }
        System.out.println("链接:" + link);
        // Only the first half of the Chinese text is printed as the address.
        System.out.println("地址:" + chinese.substring(0, chinese.length() / 2));
    }

    /**
     * Cleans the raw href slice of an anchor element.
     *
     * @return the link to print, or {@code null} when the slice does not hold a usable link
     */
    private static String extractLink(String rawHref) {
        if (!rawHref.contains("https")) {
            return null;
        }
        String compact = rawHref.replace(" ", "");
        if (compact.length() < rawHref.length()) {
            // The slice contained spaces: keep at most the first 28 characters (bounded so
            // a slice with many spaces cannot overrun) and require a quote in that head.
            String head = compact.substring(0, Math.min(28, compact.length()));
            return head.contains("\"") ? head.substring(0, head.length() - 1) : null;
        }
        // No spaces: drop the last character when the slice contains a quote.
        return rawHref.contains("\"") ? rawHref.substring(0, rawHref.length() - 1) : rawHref;
    }
}