Many websites are not strict about how they count visits: a page only has to be loaded to register as a valid view. Anti-crawler measures are getting stricter, though, and some sites will blacklist offending IPs, so this post builds a small program that scrapes proxy IPs and then uses them to drive page views (UV). I originally planned to wrap each proxy in a bean class, but none of the scraped proxies require a username or password, so an IP and a port are all that matter; to keep things simple, the code tracks just those two fields.
FileUtil: provides the interface for writing scraped IPs and URLs to files and reading them back.
CheckIPUtil: checks whether a scraped proxy IP is usable.
SpiderUtil: the main crawler module; scrapes proxy IPs and the URLs whose view counts are to be inflated.
ClickUtil: visits the specified URLs through the proxy IPs.
FileUtil contains two methods, write and readFile; given a file name, they write data to (or read data from) that file.
package com.zixuan.add_uv.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

public class FileUtil {

    // Append (or overwrite) one line of data to conf/ip.txt or conf/url.txt,
    // depending on the selection ("ip" or "url").
    public void write(String selection, String data, boolean isAppend) {
        // Make sure the conf directory exists.
        File dir = new File(System.getProperty("user.dir") + "/conf");
        if (!dir.exists()) {
            dir.mkdirs();
        }
        File file;
        if ("ip".equalsIgnoreCase(selection)) {
            file = new File(System.getProperty("user.dir") + "/conf/ip.txt");
        } else if ("url".equalsIgnoreCase(selection)) {
            file = new File(System.getProperty("user.dir") + "/conf/url.txt");
        } else {
            // Previously an unknown selection fell through and tried to write
            // to the conf directory itself; reject it instead.
            System.out.println("Unknown selection: " + selection);
            return;
        }
        try (FileOutputStream fos = new FileOutputStream(file, isAppend)) {
            fos.write(data.getBytes());
            fos.write("\r\n".getBytes());
        } catch (Exception e) {
            System.out.println("Failed to write file.");
        }
    }

    // Read a file line by line into a list and return it.
    public List<String> readFile(String fileName) {
        List<String> listStr = new ArrayList<>();
        File file = new File(System.getProperty("user.dir") + "/conf/" + fileName);
        try (FileInputStream is = new FileInputStream(file);
             Scanner scanner = new Scanner(is)) {
            while (scanner.hasNextLine()) {
                listStr.add(scanner.nextLine());
            }
        } catch (Exception e) {
            System.out.println("Failed to read file.");
        }
        return listStr;
    }
}
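For reference, here is a minimal usage sketch; the class name and the sample proxy entry are made up for illustration:

import com.zixuan.add_uv.utils.FileUtil;

// Hypothetical demo of FileUtil: append one proxy entry, then read it back.
public class FileUtilDemo {
    public static void main(String[] args) {
        FileUtil fileUtil = new FileUtil();
        fileUtil.write("ip", "127.0.0.1 8080", true);  // appended to conf/ip.txt
        for (String line : fileUtil.readFile("ip.txt")) {
            System.out.println(line);
        }
    }
}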
CheckIPUtil visits Baidu through a proxy IP: it returns true if the request succeeds and false otherwise.
package com.zixuan.add_uv.utils;

import org.jsoup.Jsoup;

public class CheckIPUtil {

    // Test whether a proxy IP is usable by fetching Baidu through it.
    public static boolean checkProxy(String ip, Integer port) {
        System.out.println("Checking: " + ip);
        try {
            Jsoup.connect("http://www.baidu.com")
                    .timeout(1 * 1000)   // fail fast: 1-second timeout
                    .proxy(ip, port)
                    .get();
            System.out.println(ip + " is usable");
            return true;
        } catch (Exception e) {
            System.out.println("Failed, " + ip + " is not usable");
            return false;
        }
    }

    // Overload that accepts an "ip port" string, as stored in ip.txt.
    public static boolean checkProxy(String s) {
        String[] strings = s.split(" ");
        return checkProxy(strings[0], Integer.parseInt(strings[1]));
    }
}
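A quick way to exercise the checker is to re-validate everything already stored in ip.txt; a minimal sketch, assuming ip.txt holds one "ip port" pair per line as written by FileUtil (the demo class name is made up):

import com.zixuan.add_uv.utils.CheckIPUtil;
import com.zixuan.add_uv.utils.FileUtil;

// Hypothetical re-validation pass over previously scraped proxies.
public class RecheckDemo {
    public static void main(String[] args) {
        FileUtil fileUtil = new FileUtil();
        for (String proxy : fileUtil.readFile("ip.txt")) {
            if (!CheckIPUtil.checkProxy(proxy)) {
                System.out.println("Stale proxy: " + proxy);
            }
        }
    }
}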
SpiderUtil implements the two crawling features: scraping proxy IPs and scraping target URLs.
Scraping proxy IPs: specify a proxy-list site and the number of pages to crawl. After a proxy IP and port are extracted from a page, they are validated with CheckIPUtil; if usable, they are appended to ip.txt. The executor class implements the Runnable interface, so scraping can run on multiple threads for better throughput.
Scraping URLs: replace the two placeholders in the code (originally marked in Chinese): the regular expression and the page to crawl. For example, to inflate the view count of a user's Qzone posts, you would pass in that user's profile page; the program then matches the post URLs on the page against the regex and writes them to url.txt, as the hypothetical example below illustrates.
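For illustration only, suppose the links you want look like https://blog.example.com/post/12345 (a made-up site); the two placeholders in spiderUrl could then be filled in roughly as:

// Hypothetical placeholder values for spiderUrl (replace with your target site):
//   urlReg = "https://blog\\.example\\.com/post/\\d+";             // the regex placeholder
//   doc    = getDocument("https://blog.example.com/u/" + username, // the page placeholder
//                        "blog.example.com");                      // the host placeholder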
package com.zixuan.add_uv.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SpiderUtil {

    static FileUtil fileUtil = new FileUtil();

    // Scrape proxy IPs from a paginated list site.
    public static void spiderIP(String url, int totalPage) {
        // Matches "ip port", e.g. "1.2.3.4 8080".
        String ipReg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";
        Pattern ipPtn = Pattern.compile(ipReg);
        for (int i = 1; i <= totalPage; i++) {
            System.out.println("Crawling page " + i + "/" + totalPage + "...");
            // Send the request and parse the response into a document.
            // Note: the Host header is hard-coded to www.kuaidaili.com here;
            // adjust it when scraping other proxy-list sites.
            Document doc;
            try {
                doc = getDocument(url + i, "www.kuaidaili.com");
            } catch (IOException e) {
                System.out.println("Link unavailable, crawl failed: " + url + i);
                return;
            }
            Matcher m = ipPtn.matcher(doc.text());
            while (m.find()) {
                String s = m.group();
                // Only keep proxies that pass the availability check.
                if (CheckIPUtil.checkProxy(s)) {
                    fileUtil.write("IP", s, true);
                }
            }
        }
    }

    // Scrape the target URLs (e.g. blog post links) from a user's page.
    public static void spiderUrl(String username) {
        HashSet<String> urlSet = new HashSet<>();
        String urlReg = "write the regex that matches the target links here";
        Pattern urlPtn = Pattern.compile(urlReg);
        Document doc;
        try {
            doc = getDocument("write the page to scrape here", "write the target site's host here");
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        Matcher m = urlPtn.matcher(doc.body().html());
        while (m.find()) {
            urlSet.add(m.group());
        }
        for (String s : urlSet) {
            System.out.println(s);
            fileUtil.write("URL", s, true);
        }
    }

    // Fetch a page with browser-like headers.
    public static Document getDocument(String url, String host) throws IOException {
        return Jsoup.connect(url)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate, sdch")
                .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                .header("Cache-Control", "max-age=0")
                .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                .header("Host", host)
                .header("Referer", "https://" + host + "/")
                .timeout(30 * 1000)
                .get();
    }

    // Build a Runnable that scrapes proxy IPs.
    public static spiderIpExcutor excutorBulid(String url, int totalPage) {
        return new spiderIpExcutor(url, totalPage);
    }

    // Runnable wrapper around spiderIP, so scraping can run on its own thread.
    static class spiderIpExcutor implements Runnable {
        String url = null;
        int totalPage = 0;

        public spiderIpExcutor(String url, int totalPage) {
            this.url = url;
            this.totalPage = totalPage;
        }

        @Override
        public void run() {
            // Check for null before calling methods on url to avoid an NPE.
            if (url == null || url.isEmpty() || totalPage <= 0) {
                System.out.println("Invalid arguments");
            } else {
                spiderIP(url, totalPage);
            }
        }
    }
}
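Since spiderIpExcutor is a Runnable, the scrape can be kicked off from any executor. A minimal sketch; the proxy-list URL and page count are assumptions (spiderIP appends the page number to the URL):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.zixuan.add_uv.utils.SpiderUtil;

// Hypothetical demo: scrape ten pages of a proxy list on a worker thread.
public class SpiderDemo {
    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        pool.execute(SpiderUtil.excutorBulid("https://www.kuaidaili.com/free/inha/", 10));
        pool.shutdown();  // let the queued scrape finish, then exit
    }
}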
click: the core click behavior; fetches a single URL through a given proxy.
clickAll: iterates over the proxy IPs and visits all the URLs with each one, calling click for every visit.
ClickExcutor: implements the Runnable interface so the visits can run on multiple threads, speeding up the UV inflation.
package com.zixuan.add_uv.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class ClickUtil {

    // Visit one URL through the given "ip port" proxy.
    public static void click(String url, String proxy) throws IOException {
        String proxyIP = proxy.split(" ")[0];
        int proxyPort = Integer.parseInt(proxy.split(" ")[1]);
        Document doc = Jsoup.connect(url)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate, sdch")
                .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                .header("Cache-Control", "max-age=0")
                .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                .header("Host", "write the target site's host here")
                .header("Referer", "write the previous page's address here, indicating where the visit came from")
                .timeout(10 * 1000)
                .proxy(proxyIP, proxyPort)
                .ignoreContentType(true)
                .get();
        // Pause between visits so the traffic looks less like a burst.
        try {
            Thread.sleep(5 * 1000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    // Visit every URL with each proxy IP in turn.
    // If a proxy fails three times, move on to the next IP.
    public static void clickAll() {
        FileUtil fileUtil = new FileUtil();
        for (String ip : fileUtil.readFile("ip.txt")) {
            int exceptionFlag = 0;
            for (String url : fileUtil.readFile("url.txt")) {
                System.out.println("Trying to visit: " + url + "\n  via proxy: " + ip);
                try {
                    click(url, ip);
                } catch (IOException e) {
                    exceptionFlag++;
                }
                if (exceptionFlag >= 3) {
                    break;
                }
            }
        }
    }

    // Build method for the click executor.
    public static ClickExcutor excutorBuild(int time) {
        return new ClickExcutor(time);
    }

    // Runnable wrapper that repeats clickAll the requested number of times.
    static class ClickExcutor implements Runnable {
        int time = 1;

        public ClickExcutor(int time) {
            if (time > 1) {
                this.time = time;
            } else {
                System.out.println("Invalid count, defaulting to one pass");
            }
        }

        @Override
        public void run() {
            for (int i = 0; i < time; i++) {
                clickAll();
            }
        }
    }
}
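Before wiring everything into the scheduler below, the click loop can be smoke-tested on its own; a minimal sketch, assuming conf/ip.txt and conf/url.txt are already populated (the demo class name is made up):

import com.zixuan.add_uv.utils.ClickUtil;

// Hypothetical smoke test: run two full click passes on a worker thread.
public class ClickDemo {
    public static void main(String[] args) {
        new Thread(ClickUtil.excutorBuild(2)).start();
    }
}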
The program entry point: a scheduled thread pool scrapes proxy IPs from several sites in parallel, then starts inflating UV after a 30-second delay.
package com.zixuan.add_uv.controler;

import com.zixuan.add_uv.utils.ClickUtil;
import com.zixuan.add_uv.utils.SpiderUtil;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class Controler {
    public static void main(String[] args) {
        ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(8);
        // Scrape several proxy-list sites in parallel.
        scheduledThreadPool.schedule(SpiderUtil.excutorBulid("https://www.xicidaili.com/nn/", 150), 1, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(SpiderUtil.excutorBulid("https://www.xicidaili.com/nt/", 150), 1, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(SpiderUtil.excutorBulid("https://www.xicidaili.com/wt/", 150), 1, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(SpiderUtil.excutorBulid("https://www.xicidaili.com/wn/", 150), 1, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(SpiderUtil.excutorBulid("https://ip.jiangxianli.com/?page=", 150), 1, TimeUnit.SECONDS);
        // Collect the target URLs for the given user on the main thread.
        SpiderUtil.spiderUrl("xxxxx");
        // Start clicking once the proxy pool has had time to fill.
        scheduledThreadPool.schedule(ClickUtil.excutorBuild(5000), 30, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(ClickUtil.excutorBuild(5000), 60, TimeUnit.SECONDS);
        scheduledThreadPool.schedule(ClickUtil.excutorBuild(5000), 90, TimeUnit.SECONDS);
    }
}