package com.fish.net;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Properties;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import com.fish.framework.constant.Charsets;
import com.fish.framework.constant.ProperKeys;
import com.fish.util.ReadToWriteIO;
import com.fish.util.StringUtil;
public class GetHtml {
/** Thread pool that runs the per-symbol fetch tasks; the batch loop replaces it after a timed-out batch. */
private static ExecutorService executorService = null;
/**
 * Permits taken by fetch tasks while they run. Volatile because the batch loop
 * reassigns it on the main thread while worker threads read it.
 */
private static volatile Semaphore semaphore = null;
/** Connection pool shared by the HTTP client; recreated when the client is rebuilt. */
private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
/** Network helper; within this class it is only configured by the static initializer. */
private static NetworkManagement nm = new NetworkManagement();
/** Size of the worker pool and number of semaphore permits. */
private static int threadPool = 100;
/** Value passed to the connection pool's {@code setMaxTotal}. */
private static int maxTotal = 200;
/**
 * Number of successful fetches in the current batch. Volatile so the main thread
 * sees increments made by worker threads; it is only ever compared against zero.
 */
private static volatile int successCount = 0;
/** Number of symbols submitted per batch. */
private static int tableEverySize = 100;
/** Fetch results keyed by symbol, accumulated until they are flushed to disk. */
private static ConcurrentHashMap<String, DetailData> baseDataList = new ConcurrentHashMap<String, DetailData>();
/** Regex compiled from the {@code ProperKeys.REGEX_STOCK_QUOTATION} property. */
private static Pattern tablePattern;
/** URL prefix read from the {@code ProperKeys.GET_STOCK_QUOTATION_PATH} property; the symbol is appended to it. */
private static String get_stock_quotation_path;
/**
* 加载配置文件
*/
static {
Properties proper = new Properties();
try {
proper.load(new FileInputStream(Thread.currentThread().getContextClassLoader().getResource("proper.properties").getPath()));
tablePattern = Pattern.compile(proper.getProperty(ProperKeys.REGEX_STOCK_QUOTATION));
get_stock_quotation_path = proper.getProperty(ProperKeys.GET_STOCK_QUOTATION_PATH);
executorService = Executors.newFixedThreadPool(threadPool);//创建线程
semaphore = new Semaphore(threadPool);//用来控制同时访问特定资源的线程数量
nm.isSpontaneousNotice(false);
} catch (Exception e) {
throw new RuntimeException("properties load fail!");
}
}
/**
 * Command-line entry point; delegates the whole crawl to {@code execution()}.
 *
 * @param args command-line arguments (unused)
 * @throws InterruptedException propagated from {@code execution()}
 * @throws IOException propagated from {@code execution()}
 */
public static void main(String[] args) throws InterruptedException, IOException {
    GetHtml.execution();
}
/**
 * Sizes the shared connection pool, builds the HTTP client on top of it, runs
 * {@code executionDetail} and prints the elapsed wall-clock time in seconds.
 * The executor, the pool and the client are shut down afterwards in all cases.
 *
 * @throws InterruptedException propagated from {@code executionDetail}
 * @throws IOException propagated from {@code executionDetail}
 */
private static void execution() throws InterruptedException, IOException{
    CloseableHttpClient client = null;
    try {
        cm.setMaxTotal(maxTotal);
        cm.setDefaultMaxPerRoute(20);
        client = HttpClients.custom().setConnectionManager(cm).build();

        final long begunAt = System.currentTimeMillis();
        executionDetail(client);
        final long elapsedSeconds = (System.currentTimeMillis() - begunAt) / 1000;

        System.out.println(tableEverySize + "条线程执行时间共:" + elapsedSeconds + "秒\t");
    } finally {
        executorService.shutdown();
        cm.close();
        if (client != null) {
            try {
                client.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
/**
 * Initial crawl of the detailed quotation data.
 *
 * Reads the symbol list from the fixed CSV path, then works through it in
 * batches of {@code tableEverySize}: every non-empty symbol gets a placeholder
 * entry in {@code baseDataList} and one {@code GetDetailThread} task. After each
 * batch the method waits for the tasks to hand their permits back, rebuilds the
 * HTTP client when the batch produced no success (and on every 100th batch), and
 * flushes the collected results to disk whenever the running item count is a
 * multiple of 500. Whatever is left is flushed once more at the end.
 *
 * @param httpclient client used for the first batches; a rebuilt client replaces
 *                   it locally when the pool is recycled
 * @throws InterruptedException if the calling thread is interrupted while pacing
 *                              or waiting for a batch
 * @throws IOException declared for callers; see the signature of the symbol reader
 */
public static void executionDetail(CloseableHttpClient httpclient)
        throws InterruptedException, IOException {
    File file = new File("E:\\test\\html\\stock\\symbol_type.csv");
    Vector<String> vList = ReadToWriteIO.readStockSymbolToFile(file, Charsets.UTF8);
    int vListSize = vList.size();
    System.out.println(vListSize + " " + tableEverySize);
    // Number of batches: full batches plus one short batch for the remainder, if any.
    int len = vListSize / tableEverySize + (vListSize % tableEverySize == 0 ? 0 : 1);
    int dataCount = 0;
    System.out.println("共" + vListSize + "条数据,将分" + len + "批经行获取,开始抓取....");
    for (int i = 0; i < len; i++) {
        int start = i * tableEverySize;
        // The last batch is shorter when the list size is not a multiple of the batch size.
        int batchSize = Math.min(tableEverySize, vListSize - start);
        successCount = 0;
        if ((i + 1) % 100 == 0) {
            // Periodically recycle the pool and client.
            httpclient = rebuildHttpClient(httpclient);
        }
        cm.closeExpiredConnections();
        System.out.println("\t\t发送第" + (i + 1) + "批次content请求");
        Thread.sleep(500);
        for (int j = 0; j < batchSize; j++) {
            dataCount++;
            String symbol_type = vList.get(start + j);
            // Compare by content: "!=" on strings only compares references.
            if (symbol_type != null && !symbol_type.isEmpty()) {
                baseDataList.put(symbol_type, new DetailData(symbol_type));
                executorService.execute(new GetDetailThread(httpclient,
                        new HttpGet(get_stock_quotation_path + symbol_type),
                        symbol_type));
                // Pace submissions so requests are not fired all at once.
                Thread.sleep(50);
            }
        }
        awaitBatchCompletion();
        if (successCount == 0) {
            // Nothing succeeded in this batch: start over with a fresh pool and client.
            System.out.println("&&&&& 重新实例化HttpClient &&&&&");
            try {
                httpclient = rebuildHttpClient(httpclient);
            } catch (RuntimeException e) {
                System.out.println("&&&&& catch 正在重新实例化.... &&&&&");
            }
            Thread.sleep(10000);
        }
        System.out.println("---- sp.availablePermits():"
                + semaphore.availablePermits());
        System.out.println("\t\t结束第" + (i + 1) + "批次content请求");
        if (dataCount % 500 == 0) {
            Thread.sleep(2000);
            try {
                writeBaseDataToFile(baseDataList);
            } catch (RuntimeException e) {
                e.printStackTrace();
            }
            // Cleared even when the write failed, so the map stays bounded.
            baseDataList.clear();
        }
    }
    try {
        Thread.sleep(5000);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Still flush what was collected, but keep the interrupt visible to the caller.
        Thread.currentThread().interrupt();
    }
    writeBaseDataToFile(baseDataList);
    baseDataList.clear();
}

/**
 * Closes the current pool and client and returns a new client on a new pool
 * configured with the same limits as the initial one.
 * A failure while closing the old client is logged and does not prevent the rebuild.
 */
private static CloseableHttpClient rebuildHttpClient(CloseableHttpClient stale) {
    try {
        cm.close();
        stale.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    cm = new PoolingHttpClientConnectionManager();
    cm.setMaxTotal(maxTotal);
    cm.setDefaultMaxPerRoute(20);
    return HttpClients.custom().setConnectionManager(cm).build();
}

/**
 * Polls once per second until all semaphore permits are available again.
 * On the tenth pass it gives up: the executor is shut down and replaced, the
 * semaphore is drained and replaced, and the method returns after a 2 s pause.
 */
private static void awaitBatchCompletion() throws InterruptedException {
    int time = 0;
    synchronized (GetHtml.class) {
        while (true) {
            if (++time == 10) {
                executorService.shutdownNow();
                executorService = Executors.newFixedThreadPool(threadPool);
                semaphore.drainPermits();
                semaphore = new Semaphore(threadPool);
                Thread.sleep(2000);
                return;
            }
            if (semaphore.availablePermits() == threadPool) {
                return;
            }
            Thread.sleep(1000);
            System.out.println("\t\t已等待" + time + "秒,已获取有效许可"
                    + semaphore.availablePermits() + "个");
        }
    }
}
/**
 * Makes sure the output directory and the two result files exist, then hands
 * the map to {@code writeToFile} with the ok-file first and the no-file second.
 *
 * @param baseDataList2 results to persist
 */
private static void writeBaseDataToFile(
        ConcurrentHashMap baseDataList2) {
    final String outputDir = "E:\\test\\html\\stock\\test2";
    File dir = new File(outputDir);
    if (!dir.exists()) {
        dir.mkdirs();
    }
    // Index 0 is the file for fetched quotations, index 1 the file for symbols without one.
    String[] fileNames = { "okquotation.csv", "noquotation.csv" };
    File[] targets = new File[fileNames.length];
    for (int k = 0; k < fileNames.length; k++) {
        File target = new File(outputDir + "/" + fileNames[k]);
        if (!target.exists()) {
            try {
                target.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        targets[k] = target;
    }
    writeToFile(baseDataList2, targets[0], targets[1], Charsets.UTF8);
}
/**
 * Appends the detailed data to file.
 *
 * For every key in {@code bdList}: when the entry's value is a non-empty string
 * it is appended verbatim to {@code okFile}; otherwise the key is appended to
 * {@code noFile} on a line of its own. Both files are opened in append mode and
 * closed before returning. I/O errors are printed and not propagated.
 *
 * @param bdList   results keyed by symbol
 * @param okFile   receives the values of entries that have one
 * @param noFile   receives the keys of entries without a value
 * @param encoding charset name used for both files
 */
public synchronized static void writeToFile(
        ConcurrentHashMap<String, DetailData> bdList, File okFile,
        File noFile, String encoding) {
    BufferedWriter bufOk = null;
    BufferedWriter bufNo = null;
    try {
        bufOk = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(okFile, true), encoding));
        bufNo = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(noFile, true), encoding));
        // NOTE(review): kept from the original; this only excludes other code that also
        // locks on the map object - confirm whether anything relies on it.
        synchronized (bdList) {
            for (String s : bdList.keySet()) {
                DetailData data = bdList.get(s);
                String str = (data == null) ? null : data.getValue();
                // Content check: a reference comparison against "" would route empty
                // values to the ok-file and never list the symbol as missing.
                if (str != null && !str.isEmpty()) {
                    bufOk.write(str);
                } else {
                    bufNo.write(s);
                    bufNo.newLine();
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // close() flushes, so each writer is flushed and closed exactly once here.
        if (bufOk != null) {
            try {
                bufOk.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (bufNo != null) {
            try {
                bufNo.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
static class GetDetailThread implements Runnable{
private final CloseableHttpClient httpClient;
private final HttpContext context;
private final HttpGet httpget;
private final String id;
/**
 * Creates a fetch task for one symbol. Each task gets its own fresh
 * {@code BasicHttpContext}.
 *
 * @param httpClient client the request is executed with
 * @param httpget    request to execute
 * @param id         key under which the result is stored in the shared result map
 */
public GetDetailThread(CloseableHttpClient httpClient, HttpGet httpget,
        String id) {
    this.id = id;
    this.httpget = httpget;
    this.httpClient = httpClient;
    this.context = new BasicHttpContext();
}
/**
 * Fetches the page for this task's symbol and stores the extracted CSV line.
 *
 * Takes one semaphore permit, executes the request, and on a 2xx response with
 * a body matches {@code tablePattern} against the page text. On a match, group 1
 * is split on commas, a fixed selection of fields is joined into one quoted CSV
 * line prefixed with the id, and the line is stored on this id's entry in
 * {@code baseDataList}. Failures are printed and the request is aborted.
 * The permit is returned on every path on which it was taken.
 */
@Override
public void run() {
    // Hold on to the semaphore this task acquired from: the batch loop may replace
    // the static field while the request is in flight.
    final Semaphore permits = semaphore;
    try {
        permits.acquire();
    } catch (InterruptedException e) {
        // No permit was obtained, so do not run and do not release one.
        Thread.currentThread().interrupt();
        System.out.println(id + " - error: " + e);
        if (httpget != null) {
            httpget.abort();
        }
        return;
    }
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpget, context);
        int status = response.getStatusLine().getStatusCode();
        if (status >= 200 && status < 300) {
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity != null) {
                String cont = trimLineToString(httpEntity, "utf-8");
                System.out.println("-------html-------" + cont);
                EntityUtils.consume(httpEntity);
                Matcher matcher = tablePattern.matcher(cont);
                if (matcher.find()) {
                    String[] strs = StringUtil.split(matcher.group(1), ",");
                    StringBuilder line = new StringBuilder();
                    line.append("\"").append(id).append("\"");
                    line.append(",\"").append(strs[1]).append("\"");
                    // Only the part of field 28 before the first space is kept.
                    line.append(",\"").append(strs[28].split(" ")[0]).append("\"");
                    // Remaining columns, in output order.
                    int[] columns = { 5, 10, 11, 13, 9, 8, 3, 4, 6, 7, 23, 22, 24 };
                    for (int c = 0; c < columns.length; c++) {
                        line.append(",\"").append(strs[columns[c]]).append("\"");
                    }
                    line.append("\r\n");
                    // Not atomic; the batch loop only checks whether it is still zero.
                    successCount++;
                    baseDataList.get(id).setValue(line.toString());
                    System.out
                            .println(" id:" + id + " " + "\t抓取成功");
                } else {
                    System.out.println(" id:" + id + " "
                            + "\t抓取失败,响应长度:"
                            + httpEntity.getContentLength());
                }
            }
        }
    } catch (Exception e) {
        if (httpget != null) {
            httpget.abort();
        }
        System.out.println(id + " - error: " + e);
    } finally {
        try {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpget != null) {
                httpget.releaseConnection();
            }
        } finally {
            // Always hand the permit back, including after a failed request.
            permits.release();
        }
    }
}
public synchronized String trimLineToString(HttpEntity entiry,
String charset) {
StringBuffer sb = new StringBuffer();
BufferedReader reader = null;
try {
InputStream instream = entiry.getContent();
reader = new BufferedReader(new InputStreamReader(instream,
charset));
String str = null;
while ((str = reader.readLine()) != null) {
if (str.trim().length() == 0) {
} else if (str.trim().contains("");
message.append("网络中断, ");
message.append(this.sleepMillisecondWhenNetWorkUnLinked);
message.append(" 毫秒后再次检测!<-------------");
System.out.println(message.toString());
}
}
/**
 * Manual entry point: creates a {@code NetworkManagement} and calls
 * {@code isSpontaneousNotice(false)} on it.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    new NetworkManagement().isSpontaneousNotice(false);
}
}