数据: 点击下载
代码: 点击下载
public class ToutiaoArticles { public static void main(String[] args) { new ToutiaoArticles().fetch(); } public void fetch() { LocalDate startDate = new LocalDate(2014, 9, 27); LocalDate endDate = LocalDate.now(); File outputFile = new File("D://data.csv"); String baseUrl = "http://toutiao.io/prev/"; PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager(); mgr.setMaxTotal(5); mgr.setDefaultMaxPerRoute(5); HttpClient httpClient = HttpClientBuilder.create().setConnectionManager(mgr).build(); HttpGet httpGet = null; String date = null; String url = null; List linkInfos = null; StringBuffer articleInfos = null; while (startDate.isBefore(endDate) || startDate.isEqual(endDate)) { date = startDate.toString("yyyy-MM-dd"); url = baseUrl + date; System.out.println("[URL]-----" + url); httpGet = new HttpGet(url); try { linkInfos = httpClient.execute(httpGet, new PageResponseHandler()); if (linkInfos != null) { articleInfos = new StringBuffer(); for (int i = 0; i < linkInfos.size(); i++) { Link k = linkInfos.get(i); String data = date + "," + (i+1) + "," + k.getTitle() + "," + k.getOriginLink() + "," + k.getLink(); System.out.println(data); articleInfos.append(data + "\r\n"); } FileUtils.writeStringToFile(outputFile, articleInfos.toString(), "GBK", true); } } catch (Exception e) { e.printStackTrace(); } finally { httpGet.releaseConnection(); } startDate = startDate.plusDays(1); } } class PageResponseHandler implements ResponseHandler> { @Override public List handleResponse(HttpResponse response) throws ClientProtocolException, IOException { HttpEntity entity = response.getEntity(); if (response.getStatusLine().getStatusCode() >= 300) { EntityUtils.consume(entity); return null; } if (entity == null) { return null; } RequestConfig requestConfig = RequestConfig.custom().setRedirectsEnabled(false).build(); PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager(); mgr.setMaxTotal(5); mgr.setDefaultMaxPerRoute(5); HttpClient httpClient = HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).setConnectionManager(mgr).build(); HttpGet httpGet = null; HttpResponse httpResponse = null; List linkInfos = new ArrayList(); Link lk = null; String html = EntityUtils.toString(entity); Document document = Jsoup.parse(html); Elements links = document.getElementsByAttributeValue("target", "_blank"); for (int i = 0; i < links.size(); i++) { lk = new Link(); lk.setLink(links.get(i).attr("href")); lk.setTitle(links.get(i).text()); httpGet = new HttpGet(lk.getLink()); try { httpResponse = httpClient.execute(httpGet); if (httpResponse.getStatusLine().getStatusCode() == 302) { String loc = httpResponse.getLastHeader("Location").getValue(); loc = loc.replaceAll("hmsr=toutiao.io", ""); loc = loc.replaceAll("&utm_medium=toutiao.io", ""); loc = loc.replaceAll("&utm_source=toutiao.io", ""); lk.setOriginLink(loc); } } catch (Exception e) { e.printStackTrace(); } finally { httpGet.releaseConnection(); } linkInfos.add(lk); } return linkInfos; } } class Link { private String title; private String link; private String originLink; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getLink() { return link; } public void setLink(String link) { this.link = link; } public String getOriginLink() { return originLink; } public void setOriginLink(String originLink) { this.originLink = originLink; } } }