学习了Java正则表达式,写了个帖子邮箱获取工具

 

看完了马士兵老师的正则表达式视频,想到贴吧里经常有人在帖子中留邮箱,于是写了这个小工具。
输入网址(要加 http://,可以单击“网址”标签粘贴剪贴板中的网址),程序会提取该网页中匹配到的邮箱地址,包括用全角字符(＠、．)书写的邮箱。
截图:

学习了Java正则表达式,写了个帖子邮箱获取工具
 
代码:
//import java.awt.*;
import java.awt.datatransfer.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.*;
import javax.swing.*;

/**
 * Small Swing tool that downloads a web page and lists the e-mail addresses found in it.
 *
 * <p>For a Baidu Tieba thread URL every page of the thread is scanned; for any other URL only
 * the given page is scanned. Addresses written with the full-width characters U+FF20 ("at")
 * and U+FF0E ("dot") are recognised and normalised to their ASCII forms.
 *
 * <p>Threading: all Swing components are touched on the event dispatch thread (EDT) only.
 * Downloading runs on one background thread at a time; a second request made while an
 * extraction is running is ignored.
 */
public class EmailSpider extends MouseAdapter implements ActionListener, KeyListener {

	private static final String INIT_URL = "http://tieba.baidu.com/p/2286659953?pn=4";
	/** URL prefix of a Tieba thread; the numeric thread id follows it. */
	private static final String URL_PREFIX = "http://tieba.baidu.com/p/";
	private static final String WORDS = "\n\n    说明:\n    百度贴吧帖子获取所有页面的邮箱,其他的获取当前页。\n    匹配不全肯定是有的,只做了粗略匹配。\n    也有可能只匹配到一段,有些人邮箱输入方式有点奇葩。\n\n    单击“网址”可以粘贴剪贴板的网址。\n    右键菜单没有写,反正可以Ctrl+C Ctrl+V\n    提取时请不要进行操作。";

	/** Rough e-mail pattern; also accepts the full-width "at" (U+FF20) and "dot" (U+FF0E). */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[\\w.-[\uff0e]]+[@[\uff20]][\\w-]+[\\.[\uff0e]]?[\\w.-[\uff0e]]*\\w");
	/** Tieba thread URL; group 1 is the thread id. */
	private static final Pattern THREAD_PATTERN =
			Pattern.compile(Pattern.quote(URL_PREFIX) + "([0-9]{1,10})");
	/** Page-count marker looked for in the page source; group 1 is the number of pages. */
	private static final Pattern TOTAL_PAGES_PATTERN = Pattern.compile("all_page_num:([0-9]{1,6})");
	/** charset parameter of an HTTP Content-Type header; group 1 is the charset name. */
	private static final Pattern CHARSET_PATTERN =
			Pattern.compile("charset\\s*=\\s*\"?([\\w.:-]+)", Pattern.CASE_INSENSITIVE);

	/** Distinct normalised addresses of the current run, in order of first appearance. */
	private final Set<String> list = new LinkedHashSet<String>();
	/** True while a background extraction is in progress. */
	private final AtomicBoolean running = new AtomicBoolean(false);

	private final JFrame frame;
	private final JTextArea textArea;
	private final JTextField textField;
	private final JButton button1, button2;
	private final JCheckBox checkBox;
	private final JLabel label1, label2;
	private final Clipboard clipboard;

	/** Builds and shows the window. Should be called on the EDT. */
	public EmailSpider() {

		frame = new JFrame("网页Email提取v1.0  by kyda");
		frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		frame.setLayout(null); // components are positioned absolutely via setBounds
		clipboard = frame.getToolkit().getSystemClipboard();

		// Clicking this label pastes the clipboard text into the URL field (see mouseClicked).
		label1 = new JLabel("<html><font size=4 color=green>网址:</font></html>");
		label1.setBounds(20, 10, 50, 30);
		label1.addMouseListener(this);
		textField = new JTextField(INIT_URL);
		textField.setBounds(70, 12, 310, 26);
		textField.addKeyListener(this);
		button1 = new JButton("<html><font size=4 color=blue>提取</font></html>");
		button1.setBounds(400, 10, 60, 30);
		button1.addActionListener(this);

		textArea = new JTextArea(WORDS);
		textArea.setLineWrap(true);
		JScrollPane scrollPane = new JScrollPane(textArea);
		scrollPane.setBounds(10, 50, 390, 290);
		scrollPane.setBorder(BorderFactory.createLoweredSoftBevelBorder());

		// Plain text plus a derived font: HTML markup rendered slightly off-position here.
		checkBox = new JCheckBox("换行");
		checkBox.setFont(checkBox.getFont().deriveFont(16f));
		checkBox.setSelected(true);
		checkBox.setBounds(410, 100, 90, 30);
		checkBox.addActionListener(this);

		button2 = new JButton("<html><font size=3>复制到<br>剪贴板</font></html>");
		button2.setBounds(410, 160, 70, 40);
		button2.addActionListener(this);
		label2 = new JLabel("<html><font size=4 color=green>请选择操作:</font></html>");
		label2.setBounds(10, 340, 480, 30);

		frame.add(label1);
		frame.add(textField);
		frame.add(button1);
		frame.add(scrollPane);
		frame.add(checkBox);
		frame.add(button2);
		frame.add(label2);

		frame.setSize(500, 400);
		frame.setLocationRelativeTo(null); // centre on screen
		frame.setResizable(false);
		frame.setVisible(true);
	}

	/** Starts an extraction (button1) or copies the result area to the clipboard (button2). */
	@Override
	public void actionPerformed(ActionEvent e) {
		Object source = e.getSource();
		if (source == button1) {
			startAnalyse();
		} else if (source == button2) {
			String text = textArea.getText();
			if (text.length() > 0) {
				StringSelection selection = new StringSelection(text);
				clipboard.setContents(selection, selection);
				label2.setText("<html><font color=red>已复制到剪贴板。</font></html>");
			}
		}
	}

	/** Left-clicking the URL label pastes the clipboard's text into the URL field. */
	@Override
	public void mouseClicked(MouseEvent e) {
		if (e.getButton() != MouseEvent.BUTTON1) {
			return;
		}
		try {
			// getContents may return null when the clipboard is empty.
			Transferable contents = clipboard.getContents(frame);
			String text = contents == null
					? ""
					: (String) contents.getTransferData(DataFlavor.stringFlavor);
			if (text == null || text.length() == 0) {
				label2.setText("剪贴板无内容!");
			} else {
				textField.setText(text);
				label2.setText("网址粘贴成功!");
			}
		} catch (UnsupportedFlavorException ex) {
			label2.setText("剪贴板内容不是字符串!");
		} catch (IOException | IllegalStateException ex) {
			// Data no longer available, or the clipboard is held by another application.
			label2.setText("读取剪贴板失败!");
		}
	}

	/** Pressing Enter in the URL field starts an extraction. */
	@Override
	public void keyPressed(KeyEvent e) {
		if (e.getKeyCode() == KeyEvent.VK_ENTER) {
			startAnalyse();
		}
	}

	@Override
	public void keyReleased(KeyEvent arg0) {
	}

	@Override
	public void keyTyped(KeyEvent arg0) {
	}

	/**
	 * Starts the extraction on a background thread so the status line can update while pages
	 * are downloaded. Does nothing if an extraction is already running. Should be called on
	 * the EDT, because it reads the URL field.
	 */
	public void startAnalyse() {
		if (!running.compareAndSet(false, true)) {
			return;
		}
		final String address = textField.getText();
		new Thread(new Runnable() {
			@Override
			public void run() {
				try {
					analyse(address);
				} finally {
					running.set(false);
				}
			}
		}).start();
	}

	/**
	 * Extracts the addresses for the URL currently in the URL field, on the calling thread.
	 * Blocks while pages are downloaded; prefer {@link #startAnalyse()} from UI code.
	 */
	public void analyse() {
		analyse(textField.getText());
	}

	/** Downloads the page(s) behind {@code address}, collects addresses and shows the result. */
	private void analyse(String address) {
		list.clear(); // never carry addresses over from an earlier (possibly failed) run
		if (address == null || address.trim().isEmpty()) {
			showStatus("请输入正确的网址!");
			return;
		}
		showStatus("邮箱地址提取中……");
		try {
			String threadId = parseThreadId(address);
			if (threadId == null) {
				// Not a Tieba thread: only the given page is scanned.
				showStatus(progressText(1, 1, address));
				scan(readLines(address));
			} else {
				// Always start at page 1, whatever page the entered URL pointed to.
				String firstUrl = pageUrl(threadId, 1);
				List<String> firstPage = readLines(firstUrl);
				int totalPages = findTotalPages(firstPage);
				if (totalPages < 0) { // no page count in the source
					showStatus("貌似出错了~你真确定有这个帖子?");
					return;
				}
				showStatus(progressText(1, totalPages, firstUrl));
				scan(firstPage);
				for (int page = 2; page <= totalPages; ++page) {
					String url = pageUrl(threadId, page);
					showStatus(progressText(page, totalPages, url));
					scan(readLines(url));
				}
			}
			showResult(new ArrayList<String>(list));
		} catch (MalformedURLException | IllegalArgumentException | UnknownHostException e) {
			showStatus("请输入正确的网址!");
		} catch (IOException e) {
			e.printStackTrace();
			showStatus("读取网页失败:" + e);
		}
	}

	/**
	 * Adds every e-mail address found in {@code str} to the current result set; addresses
	 * already present are ignored.
	 *
	 * @param str one line of page source; {@code null} is ignored
	 */
	public void getEmailAddr(String str) {
		if (str != null) {
			list.addAll(extractEmails(str));
		}
	}

	/**
	 * Returns the e-mail addresses in {@code text}, in order of appearance (duplicates kept),
	 * with full-width "at"/"dot" characters replaced by their ASCII forms.
	 */
	static List<String> extractEmails(String text) {
		List<String> found = new ArrayList<String>();
		Matcher m = EMAIL_PATTERN.matcher(text);
		while (m.find()) {
			found.add(m.group().replace('\uff20', '@').replace('\uff0e', '.'));
		}
		return found;
	}

	/** Returns the Tieba thread id contained in {@code address}, or null if there is none. */
	static String parseThreadId(String address) {
		Matcher m = THREAD_PATTERN.matcher(address);
		return m.find() ? m.group(1) : null;
	}

	/** Returns the page count announced in {@code sourceLine}, or -1 if the line has none. */
	static int parseTotalPages(String sourceLine) {
		Matcher m = TOTAL_PAGES_PATTERN.matcher(sourceLine);
		return m.find() ? Integer.parseInt(m.group(1)) : -1;
	}

	/**
	 * Returns the charset named in an HTTP Content-Type value, or the platform default when
	 * the value is null, names no charset, or names one this JVM does not support.
	 */
	static Charset charsetOf(String contentType) {
		if (contentType != null) {
			Matcher m = CHARSET_PATTERN.matcher(contentType);
			if (m.find()) {
				try {
					return Charset.forName(m.group(1));
				} catch (IllegalArgumentException ignored) {
					// Illegal or unsupported charset name: fall back to the default below.
				}
			}
		}
		return Charset.defaultCharset();
	}

	/** Returns the first page count found in {@code lines}, or -1 if no line has one. */
	private static int findTotalPages(List<String> lines) {
		for (String sourceLine : lines) {
			int pages = parseTotalPages(sourceLine);
			if (pages >= 0) {
				return pages;
			}
		}
		return -1;
	}

	/** Builds the URL of page {@code page} of the Tieba thread {@code threadId}. */
	private static String pageUrl(String threadId, int page) {
		return URL_PREFIX + threadId + "?pn=" + page;
	}

	/** Status-line text shown while a page is being processed. */
	private static String progressText(int page, int totalPages, String url) {
		return "正在提取页面:" + page + "  共" + totalPages + "页   地址:" + url;
	}

	/** Downloads {@code address} and returns its lines; the connection is always closed. */
	private static List<String> readLines(String address) throws IOException {
		URLConnection connection = new URL(address).openConnection();
		try (InputStream in = connection.getInputStream();
				BufferedReader reader = new BufferedReader(
						new InputStreamReader(in, charsetOf(connection.getContentType())))) {
			List<String> lines = new ArrayList<String>();
			String sourceLine;
			while ((sourceLine = reader.readLine()) != null) {
				lines.add(sourceLine);
			}
			return lines;
		}
	}

	/** Collects the addresses of every line of one page. */
	private void scan(List<String> lines) {
		for (String sourceLine : lines) {
			getEmailAddr(sourceLine);
		}
	}

	/** Sets the status line; safe to call from any thread. */
	private void showStatus(final String text) {
		SwingUtilities.invokeLater(new Runnable() {
			@Override
			public void run() {
				label2.setText(text);
			}
		});
	}

	/**
	 * Shows the collected addresses, each followed by ';' (and a line break when the
	 * check box is selected), plus the total count. Safe to call from any thread.
	 */
	private void showResult(final List<String> addresses) {
		SwingUtilities.invokeLater(new Runnable() {
			@Override
			public void run() {
				String terminator = checkBox.isSelected() ? ";\n" : ";";
				StringBuilder result = new StringBuilder();
				for (String address : addresses) {
					result.append(address).append(terminator);
				}
				textArea.setText(result.toString());
				label2.setText("提取完成!  邮箱总计:" + addresses.size());
			}
		});
	}

	public static void main(String[] args) {
		SwingUtilities.invokeLater(new Runnable() {
			@Override
			public void run() {
				// Native look and feel of the current platform (the Windows one on Windows).
				try {
					UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
				} catch (ReflectiveOperationException | UnsupportedLookAndFeelException e) {
					System.err.println("System look and feel unavailable, using the default: " + e);
				}
				new EmailSpider();
			}
		});
	}
}
 
 

你可能感兴趣的:(java)