1、工作的背景及意义:
由于经常要查找IP的运营商等相关属性,数据量不多的情况下手动查找还比较好,但是在数据量超过几百,几千,这样手动查找就比较费力了,这样机械的工作交给计算机处理最合适了。没有必要人为的机械性查找。
2、ip抓取第一阶段需求:
从这个链接http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest抓取含有CN、ipv4的IP数据,再发送请求到数据查询中心http://wq.apnic.net/apnic-bin/whois.pl,获取返回的数据,并从中提取IP相关属性的数据并保存。
3、程序文档分析:
3.1抓取(http://ftp……)链接下txt文本中含有CN、ipv4的数据,将抓取到的数据存进HTJF.txt。【通过程序中的getMail()和savetxt()方法完成】
3.2发送请求到IP查询网站,并接收返回的html文本(通过程序中的readtxt()和testPost()方法完成)。
3.3对接收的文本进行解析、过滤
第一次过滤:过滤完毕存进IpHTML.txt
第二次过滤:过滤完毕存进Ip1.txt
第三次过滤:过滤完毕存进Ip2.txt
最后入库:把IP的相关属性封装成对象,存入数据库前先查询该IP段是否已存在,
存在:不执行
不存在:执行JDBC操作
4、程序性能描述:
从7万多条数据中抓取到3千多条数据,
全程跑完历时:50分钟。期间抛出一次异常。
代码如下:
package com.htjf.ip; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.net.URL; import java.net.URLConnection; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.helper.StringUtil; import org.jsoup.nodes.Document; /** * @author Qixuan * */ public class IpDemo { /** * @param args * 程序入口 * @throws IOException */ public static void main(String args[]) throws IOException { // Document doc=null; // doc = // Jsoup.connect("http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest").timeout(1000000).get(); try { System.out.println("爬取"); // List<String> list=getMail(); System.out.println("保存"); // savetxt(list); System.out.println("发送请求"); readtxt();// 发送请求并进行多个规则过滤 /* * testPost("112.46.78.4");//发送请求, saveLastIP();//规则1 IpModel * ipModel=saveLastIP2();//规则2 * * MySql ipsql=new MySql(); ipsql.insertIp(ipModel);//存进数据库 */ } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }// ///////// /* * 1、读取文件 2、对读取的数据进行规则匹配,从中获取符合规则的数据 3、将符合规则的数据储存到集合中 */ public static List<String> getMail() throws Exception { // /从本地文件中爬 // BufferedReader br=new BufferedReader(new // FileReader("d:\\mail.html")); // /从网络文件中爬 URL url = new URL( "http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest"); BufferedReader br = new BufferedReader(new InputStreamReader( url.openStream())); String mail_regex = "CN\\|ipv4";// /关键字的匹配规则 Pattern p = Pattern.compile(mail_regex);// 将规则封装成对象 
List<String> list = new ArrayList<String>(); String line = null; while ((line = br.readLine()) != null) { Matcher m = p.matcher(line);// 一行一行地进行匹配 while (m.find()) { // m.group()找到就放进组里 // list.add(m.group()); list.add(line); } } return list; } /** * @param list * 将爬取到的含有CN、ipv4的数据存到HTJF.txt中 * @throws FileNotFoundException */ public static void savetxt(List<String> listarray) throws FileNotFoundException { /* 将A文件中的内容,保存到B文件中 */ // BufferedReader bufr=new BufferedReader(new InputStreamReader(new // FileInputStream("D://xuan.txt"))); BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream("E://HTJF.txt"))); try { System.out.println("有多少条记录:" + listarray.size()); System.out.println("爬到的资源"); /* * List<String> cnList=new ArrayList<String>(); List<String> * ipv4List=new ArrayList<String>(); List<String> ipList=new * ArrayList<String>(); */ String[] str = new String[10]; for (String mail : listarray) { System.out.println("====>" + mail); str = mail.split("\\|"); String line = null; // int length=str.length; bufw.write(str[3]); bufw.write(","); bufw.write(str[2]); bufw.write(","); bufw.write(str[1]); bufw.newLine();// /换行 bufw.flush();// 刷新 } bufw.close();// } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }// /////// /** * 读取文本,并Post到ip运营商查询网站中 * * @throws IOException */ public static void readtxt() throws IOException { BufferedReader bufr = new BufferedReader(new InputStreamReader( new FileInputStream("E://HTJF.txt"))); String[] array = new String[10]; Document doc = null; String line = null; while ((line = bufr.readLine()) != null) { array = line.split("\\,"); System.out.println("ip地址:" + array[0]); String searchtextIp = array[0]; testPost(searchtextIp);// 发送请求并过滤,调用3000几次 } } /** * @throws IOException * 使用java程序模拟页面发送http的post请求,并过滤标签 */ public static void testPost(String iptest) throws IOException { /** * 首先要和URL下的URLConnection对话。 URLConnection可以很容易的从URL得到。比如: // Using * java.net.URL and 
//java.net.URLConnection * * 使用页面发送请求的正常流程:在页面http://www.faircanton.com/message/loginlytebox. * asp中输入用户名和密码,然后按登录, * 跳转到页面http://www.faircanton.com/message/check.asp进行验证 验证的的结果返回到另一个页面 * * 使用java程序发送请求的流程:使用URLConnection向http://www.faircanton.com/message/ * check.asp发送请求 并传递两个参数:用户名和密码 然后用程序获取验证结果 */ URL url = new URL("http://wq.apnic.net/apnic-bin/whois.pl"); URLConnection connection = url.openConnection(); /** * 然后把连接设为输出模式。URLConnection通常作为输入来使用,比如下载一个Web页。 * 通过把URLConnection设为输出,你可以把数据向你个Web页传送。下面是如何做: */ connection.setDoOutput(true); /** * 最后,为了得到OutputStream,简单起见,把它约束在Writer并且放入POST信息中,例如: ... */ OutputStreamWriter out = new OutputStreamWriter( connection.getOutputStream(), "8859_1"); out.write("searchtext=" + iptest + "&form_type=advanced"); // 向页面传递数据。post的关键所在! // out.write("username=kevin&password=*********"); //向页面传递数据。post的关键所在! // remember to clean up out.flush(); out.close(); /** * 这样就可以发送一个看起来象这样的POST: POST /jobsearch/jobsearch.cgi HTTP 1.0 ACCEPT: * text/plain Content-type: application/x-www-form-urlencoded * Content-length: 99 username=bob password=someword */ // 一旦发送成功,用以下方法就可以得到服务器的回应: String sCurrentLine; String sTotalString; sCurrentLine = ""; sTotalString = ""; InputStream l_urlStream; l_urlStream = connection.getInputStream(); // 传说中的三层包装阿! 
String mail_regex = "<(.[^>]*)>";// /过滤标签的规则 Pattern p = Pattern.compile(mail_regex);// 将规则封装成对象 BufferedReader l_reader = new BufferedReader(new InputStreamReader( l_urlStream)); BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream("E://IpHTML.txt"))); while ((sCurrentLine = l_reader.readLine()) != null) { // sTotalString += sCurrentLine + "/r/n"; // Matcher m=p.matcher(sCurrentLine);//一行一行地进行匹配 // Matcher m=p.matcher(sCurrentLine);//一行一行地进行匹配 sCurrentLine = sCurrentLine.replaceAll(mail_regex, "").trim(); bufw.write(sCurrentLine); bufw.newLine();// /换行 bufw.flush();// 刷新 } bufw.close(); // System.out.println("页面相应的内容"); // System.out.println(sTotalString); System.out.println("第一次过滤完毕,开始下一轮过滤"); saveLastIP();// 第二次过滤 }// /////////////////// /** * @throws IOException * 匹配ip所需要的字段1 */ public static void saveLastIP() throws IOException { BufferedReader bufr = new BufferedReader(new InputStreamReader( new FileInputStream("E://IpHTML.txt"))); BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream("E://Ip1.txt"))); String[] mail_regex1 = { "inetnum:", "netname:", "descr:", "country:" };// /IP的匹配规则 String line = null; while ((line = bufr.readLine()) != null) { for (int i = 0; i < mail_regex1.length; i++) { Pattern p = Pattern.compile(mail_regex1[i]);// 将规则封装成对象 Matcher m = p.matcher(line);// 一行一行地进行匹配 while (m.find()) { Pattern p2 = Pattern.compile("\\s*|\t|\r|\n"); Matcher m2 = p2.matcher(line); String line2 = m2.replaceAll(""); bufw.write(line2); bufw.newLine();// /换行 bufw.flush();// 刷新 } } } bufw.close(); System.out.println("第二次过滤完毕,开始下一轮过滤"); saveLastIP2();// 第三次过滤 } /** * @throws IOException * 匹配ip所需要的字段 */ public static void saveLastIP2() throws IOException { BufferedReader bufr = new BufferedReader(new InputStreamReader( new FileInputStream("E://Ip1.txt"))); BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream("E://Ip2.txt"))); String[] array = new String[2]; String[] 
temp = new String[2]; IpModel ipModel = new IpModel(); int k = 1; String line = null; while ((line = bufr.readLine()) != null) { array = line.split("\\:"); if ("inetnum".equals(array[0])) { temp = array[1].split("\\-"); ipModel.setStartIp(temp[0]); ipModel.setEndIp(temp[1]); } else if ("netname".equals(array[0])) { if (array[1].indexOf("-") > 0) { temp = array[1].split("\\-"); ipModel.setProvince(temp[0]); ipModel.setAttribution(temp[1]); } else { ipModel.setProvince(""); ipModel.setAttribution(array[1]); } } else if ("descr".equals(array[0])) { if (k == 1) { try { if (StringUtil.isBlank(array[1])) { ipModel.setOperator(""); } else { ipModel.setOperator(array[1]); } } catch (ArrayIndexOutOfBoundsException e) { System.out.println("数组越界!"); e.printStackTrace(); } } else if (k == 2) { try { if (StringUtil.isBlank(array[1])) { ipModel.setOperator(""); } else { ipModel.setOperator(array[1]); } } catch (ArrayIndexOutOfBoundsException e) { System.out.println("数组越界!"); e.printStackTrace(); } } else if (k == 3) { try { if (StringUtil.isBlank(array[1])) { ipModel.setOperator(""); } else { ipModel.setOperator(array[1]); } } catch (ArrayIndexOutOfBoundsException e) { System.out.println("数组越界!"); e.printStackTrace(); } } else if (k == 4) { try { if (StringUtil.isBlank(array[1])) { ipModel.setOperator(""); } else { ipModel.setOperator(array[1]); } } catch (ArrayIndexOutOfBoundsException e) { System.out.println("数组越界!"); e.printStackTrace(); } } k++; } else if ("country".equals(array[0])) { ipModel.setCountry(array[1]); } } bufw.write(ipModel.getCountry()); bufw.write(" ");// /换行 bufw.write(ipModel.getAttribution()); // 归属地 bufw.write(" ");// /换行 bufw.write(ipModel.getProvince()); bufw.write(" ");// /换行 bufw.write(ipModel.getOperator());// 运营商 bufw.write(" ");// /换行 bufw.write(ipModel.getStartIp()); bufw.write(" ");// /换行 bufw.write(ipModel.getEndIp()); bufw.newLine();// /换行 bufw.flush();// 刷新 bufw.close(); SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddhhmmss"); String 
ipId = getRandomString(14) + sdf.format(new Date()); ipModel.setIpId(ipId); System.out.println("匹配完毕保存到数据库"); MySql ipsqlSql = new MySql(); ipsqlSql.insertIp(ipModel); // return ipModel; } /** * @param length * @return 生成随机数 */ public static String getRandomString(int length) { // length表示生成字符串的长度 String base = "abcdefghijklmnopqrstuvwxyz0123456789"; Random random = new Random(); StringBuffer sb = new StringBuffer(); for (int i = 0; i < length; i++) { int number = random.nextInt(base.length()); sb.append(base.charAt(number)); } return sb.toString(); } }// /////////////////////////// class MySql { public static String username; public static String password; public static Connection connection; public static PreparedStatement ps; // //构造函数 public MySql() { String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull"; String username = "root"; String password = ""; // 加载驱动程序以连接数据库 try { Class.forName("com.mysql.jdbc.Driver"); connection = DriverManager.getConnection(url, username, password); } // 捕获加载驱动程序异常 catch (ClassNotFoundException cnfex) { System.err.println("装载 JDBC/ODBC 驱动程序失败"); cnfex.printStackTrace(); } // 捕获连接数据库异常 catch (SQLException sqlex) { System.err.println("无法连接数据库"); sqlex.printStackTrace(); } } /** * @param ipModel * private String country;//国家地区 private String province;//省份 * private String operator;//运营商 private String attribution;//归属地 * private String startIp;//起始Ip private String endIp;//结束Ip * */ public void insertIp(IpModel ipModel) { MySql ipsql = new MySql(); List<IpModel> list = ipsql.findIp(ipModel); if (list.size() > 0) { System.out.println("已存在有数据"); } else { try { ps = connection .prepareStatement("insert into iptable (ip_id,country,province,operator,attribution,startIp,endIp) values (?,?,?,?,?,?,?)"); /* * SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMddhhmmss"); * String ipId=sdf.format(new Date()); */ ps.setString(1, ipModel.getIpId()); ps.setString(2, 
ipModel.getCountry()); ps.setString(3, ipModel.getProvince()); ps.setString(4, ipModel.getOperator()); ps.setString(5, ipModel.getAttribution()); ps.setString(6, ipModel.getStartIp()); ps.setString(7, ipModel.getEndIp()); ps.executeUpdate(); System.out.println("记录插入成功"); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * @param ipModel * 更新update table set a=REPLACE(a,'1','2'); */ public void updateIp(IpModel ipModel) { try { ps = connection .prepareStatement("update iptable set(country=?,province=?,operator=?,attribution=?,startIp,endIp=?) where ip_id=?"); ps.setString(1, ipModel.getIpId()); ps.setString(2, ipModel.getCountry()); ps.setString(3, ipModel.getProvince()); ps.setString(4, ipModel.getOperator()); ps.setString(5, ipModel.getAttribution()); ps.setString(6, ipModel.getStartIp()); ps.setString(7, ipModel.getEndIp()); ps.executeUpdate(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * @param ipModel * @return 查询 */ public List<IpModel> findIp(IpModel ipModel) { java.util.List<IpModel> list = new ArrayList<IpModel>(); try { ps = connection .prepareStatement("select * from iptable where startIp=? and endIp=?"); ps.setString(1, ipModel.getStartIp()); ps.setString(2, ipModel.getEndIp()); ResultSet rs = ps.executeQuery(); IpModel ipmodel = new IpModel(); while (rs.next()) { ipmodel.setStartIp(rs.getString("ip_id")); ipmodel.setStartIp(rs.getString("startIp")); ipmodel.setStartIp(rs.getString("endIp")); list.add(ipmodel); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } return list; } }
package com.htjf.ip; /** * @author Qixuan * */ public class IpModel { private String ipId; private String country;// 国家地区 private String province;// 省份 private String operator;// 运营商 private String attribution;// 归属地 private String startIp;// 起始Ip private String endIp;// 结束Ip public String getIpId() { return ipId; } public void setIpId(String ipId) { this.ipId = ipId; } public String getCountry() { return country; } public void setCountry(String country) { this.country = country; } public String getProvince() { return province; } public void setProvince(String province) { this.province = province; } public String getOperator() { return operator; } public void setOperator(String operator) { this.operator = operator; } public String getAttribution() { return attribution; } public void setAttribution(String attribution) { this.attribution = attribution; } public String getStartIp() { return startIp; } public void setStartIp(String startIp) { this.startIp = startIp; } public String getEndIp() { return endIp; } public void setEndIp(String endIp) { this.endIp = endIp; } }