获取提交历史需要登录,看了下登录时传递的参数。
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# author: wangzhenqing
# date: 2015-06-29 14:32:07
import requests
from BeautifulSoup import BeautifulSoup as soup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def get_login_requests(login_url, user_id, password):
    """Log in and return the session used for the login request.

    Args:
        login_url: URL the login form is posted to.
        user_id: Value sent as the ``user_id`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The ``requests`` session that issued the login POST. The response
        to that POST is not inspected, so the session is returned whether
        or not the login was accepted.
    """
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/43.0.2357.130 Safari/537.36')
    session = requests.session()
    # Send the credentials as an ordinary URL-encoded form, with a
    # browser-like User-Agent.
    session.post(
        login_url,
        data={'user_id': user_id, 'password': password},
        headers={
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': user_agent,
        }
    )
    return session
def get_every_java_solution(s, prob_id, user_id):
    """Return the source of the first accepted C++ submission of a problem.

    Despite the historical name, the language filter below selects C++
    submissions, not Java ones.

    Args:
        s: Session used for the HTTP requests (a logged-in ``requests``
            session).
        prob_id: Problem id, sent as the ``pid`` query parameter.
        user_id: User whose submission history is listed.

    Returns:
        The text of the first ``<pre>`` element on the source page of the
        first matching submission; ``''`` when the status page contains no
        table; ``None`` when no submission row yields usable source.
    """
    # Fetch the submission-status page for this problem and user.
    r = s.get('http://ac.jobdu.com/status.php?pid=' +
              str(prob_id) + '&user_id=' + user_id)
    # Force UTF-8, otherwise the decoded text comes out garbled.
    r.encoding = 'utf-8'
    tables = soup(r.text).findAll('table')
    if not tables:
        return ''
    # The submission history is taken from the first table; each
    # submission row is looked up as its own <tbody>.
    for body in tables[0].findAll('tbody'):
        tds = body.findAll(name='td')
        # Cells are read by fixed position: 0 = submission id,
        # 3 = verdict, 8 = language. Skip rows that are too short
        # instead of failing with an IndexError.
        if len(tds) < 9:
            continue
        verdict = tds[3].find('font')
        if verdict is None or verdict.string != 'Accepted':
            continue
        if 'C++' not in tds[8].text:
            continue
        sid = tds[0].string
        if not sid:
            continue
        url = 'http://ac.jobdu.com/showsource.php?sid=' + sid
        urlr = s.get(url)
        urlr.encoding = 'utf-8'
        # convertEntities turns HTML entities in the listing back into
        # plain characters.
        pre = soup(urlr.text,
                   convertEntities=soup.HTML_ENTITIES).findAll('pre')
        if not pre:
            # No source block on the page: report it and try the next
            # submission rather than indexing an empty list.
            print(prob_id)
            print(url)
            continue
        return pre[0].string
    return None
def get_prob_names(s, prob_id):
    """Return the title of a problem, used as the output file name.

    Args:
        s: Session used for the HTTP request.
        prob_id: Problem id, sent as the ``pid`` query parameter.

    Returns:
        The text of the first ``<dt class="title-hd">`` element of the
        problem page, or ``''`` when the request does not return status
        200 or the page has no such element.
    """
    r = s.get('http://ac.jobdu.com/problem.php?pid=' + str(prob_id))
    if r.status_code != 200:
        return ''
    r.encoding = 'utf-8'
    page = soup(r.text, convertEntities=soup.HTML_ENTITIES)
    heads = page.findAll('dt', {"class": "title-hd"})
    # A page without a title element would otherwise raise IndexError.
    if not heads:
        return ''
    return heads[0].text
def write_file(filename, code):
    """Write ``code`` to ``c++/<filename>.cpp``.

    Args:
        filename: Base name of the output file, without directory or
            extension. Text names are encoded as UTF-8 (characters that
            cannot be encoded are dropped); byte strings are used as-is.
        code: Source text to write.
    """
    import os

    if not isinstance(filename, bytes):
        filename = filename.encode('utf-8', 'ignore')
    out_dir = b'c++'
    # open() does not create missing directories, so make sure the
    # output directory exists before the first write.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # The with-block closes the file even when write() raises.
    with open(out_dir + b'/' + filename + b'.cpp', 'w') as file_object:
        file_object.write(code)
def print_all_problems():
    """Download one accepted solution for every problem id in the range.

    Walks problem ids 1001..1557 inclusive. For each id it fetches the
    accepted source via ``get_every_java_solution``, looks up the problem
    title via ``get_prob_names`` and stores the source via ``write_file``.
    Problems without a solution or without a title are skipped.
    """
    first_id = 1001
    last_id = 1557
    login_url = 'http://ac.jobdu.com/login.php'
    username = 'wangzhenqing'
    password = 'password'
    # Log in once and reuse the session instead of logging in again for
    # every problem.
    s = get_login_requests(login_url, username, password)
    for prob_id in range(first_id, last_id + 1):
        code = get_every_java_solution(s, prob_id, username)
        if not code:
            continue
        filename = get_prob_names(s, prob_id)
        print(filename)
        # An empty title would produce a file literally named ".cpp".
        if not filename:
            continue
        write_file(filename, code)
# Script entry point: scrape every problem, then report completion.
if __name__ == '__main__':
    print_all_problems()
    print('over')
/**
* @author:wangzq
* @email:[email protected]
* @date:2015-06-30 11:01:54
* @description:将抓取的代码进行代码格式化
*/
import org.eclipse.jdt.core.JavaCore;
import org.eclipse.jdt.core.ToolFactory;
import org.eclipse.jdt.core.formatter.CodeFormatter;
import org.eclipse.jdt.core.formatter.DefaultCodeFormatterConstants;
import org.eclipse.jface.text.Document;
import org.eclipse.jface.text.IDocument;
import org.eclipse.text.edits.TextEdit;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
/**
 * Adds a header comment to every source file of a directory, formats the
 * result with the Eclipse JDT core formatter and writes it to a second
 * directory.
 */
public class JavaCodeFormatUtils {
    /** Charset name used for both reading and writing source files. */
    private static final String ENCODING = "utf-8";

    /** File-name suffix removed when deriving the problem name. */
    private static final String JAVA_SUFFIX = ".java";

    /**
     * Formats Java source with the Eclipse JDT core formatter.
     *
     * @param code     source text to format
     * @param fileName name printed to standard output when formatting
     *                 produced nothing, so the file can be identified
     * @return the formatted source followed by a newline, or {@code code}
     *         unchanged when the formatter returned no edit or failed
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static String reformatCode(String code, String fileName) {
        String formatCode = "";
        Map m = DefaultCodeFormatterConstants.getEclipseDefaultSettings();
        // JDT option values are strings, so the version constants are
        // used here rather than a numeric literal, which would be boxed
        // to a Double.
        m.put(JavaCore.COMPILER_COMPLIANCE, JavaCore.VERSION_1_6);
        m.put(JavaCore.COMPILER_CODEGEN_TARGET_PLATFORM, JavaCore.VERSION_1_6);
        m.put(JavaCore.COMPILER_SOURCE, JavaCore.VERSION_1_6);
        m.put(DefaultCodeFormatterConstants.FORMATTER_LINE_SPLIT, "80");
        m.put(DefaultCodeFormatterConstants.FORMATTER_TAB_CHAR,
                JavaCore.SPACE);
        try {
            CodeFormatter codeFormatter = ToolFactory.createCodeFormatter(m);
            TextEdit textEdit = codeFormatter.format(
                    CodeFormatter.K_UNKNOWN, code, 0, code.length(), 0, null);
            if (textEdit != null) {
                IDocument doc = new Document(code);
                textEdit.apply(doc);
                formatCode += doc.get() + "\n";
            }
        } catch (Exception e) {
            System.err.println("格式化文件出错" + e);
            e.printStackTrace();
        }
        if ("".equals(formatCode)) {
            // Nothing was formatted: report the file and fall back to the
            // unformatted input.
            System.out.println(fileName);
            return code;
        }
        return formatCode;
    }

    /**
     * Reads a file as UTF-8 text.
     *
     * @param file file to read
     * @return the file content with every line terminated by {@code "\n"};
     *         an empty string when {@code file} is not a regular file;
     *         whatever was read so far when reading fails part-way
     */
    public static String getFileCode(File file) {
        StringBuilder code = new StringBuilder();
        FileInputStream in = null;
        try {
            if (file.isFile() && file.exists()) {
                in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, ENCODING));
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    code.append(lineTxt).append("\n");
                }
            } else {
                System.err.println("找不到指定的文件");
            }
        } catch (Exception e) {
            System.err.println("读取文件内容出错");
            e.printStackTrace();
        } finally {
            // Release the file handle on the error path as well.
            closeQuietly(in);
        }
        return code.toString();
    }

    /**
     * Processes every regular file directly inside {@code path}: prepends
     * the header from {@link #getCodeHead(String)}, formats the result and
     * writes it under the same file name into {@code newPath}.
     *
     * @param path    directory containing the input files
     * @param newPath directory the processed files are written to
     */
    public static void getDirectoryFiles(String path, String newPath) {
        File dirFile = new File(path);
        File[] files = dirFile.isDirectory() ? dirFile.listFiles() : null;
        if (files == null) {
            // Not a directory, or it could not be listed: stop here
            // instead of iterating over null.
            System.err.println(path + "不是文件夹,请检查!");
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue; // sub-directories hold no source to process
            }
            String fileName = file.getName();
            String code = getCodeHead(fileName) + getFileCode(file);
            code = reformatCode(code, fileName);
            writeCodeToFile(newPath + File.separator + fileName, code);
        }
    }

    /**
     * Writes {@code code}, followed by a line separator, to a file as
     * UTF-8 text, replacing any existing content.
     *
     * @param filePath path of the file to write
     * @param code     text to write
     */
    public static void writeCodeToFile(String filePath, String code) {
        PrintStream ps = null;
        try {
            // Write with the same charset the files are read with, rather
            // than the platform default.
            ps = new PrintStream(new File(filePath), ENCODING);
            ps.println(code);
            // PrintStream records write failures instead of throwing.
            if (ps.checkError()) {
                System.err.println("写文件内容出错");
            }
        } catch (IOException e) {
            System.err.println("写文件内容出错");
            e.printStackTrace();
        } finally {
            if (ps != null) {
                ps.close();
            }
        }
    }

    /**
     * Builds the header comment that is placed in front of a source file.
     *
     * @param fileName name of the source file; characters 2 to 5 are used
     *                 as the problem id in the generated URL (assumes the
     *                 name starts with a two-character prefix followed by
     *                 a four-digit id; TODO confirm against the scraper's
     *                 naming) and the name without its {@code .java}
     *                 suffix is used as the problem name
     * @return the header text, ending with a newline
     */
    public static String getCodeHead(String fileName) {
        // Names too short to carry an id yield an empty id instead of a
        // StringIndexOutOfBoundsException.
        String probId = fileName.length() >= 6 ? fileName.substring(2, 6) : "";
        String probName = fileName.endsWith(JAVA_SUFFIX)
                ? fileName.substring(0, fileName.length() - JAVA_SUFFIX.length())
                : fileName;
        SimpleDateFormat dateFormat =
                new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        StringBuilder head = new StringBuilder();
        head.append("\n");
        head.append("\n");
        head.append("// ").append(probName).append("\n");
        head.append("\n");
        head.append("/**").append("\n");
        head.append(" * @author:wangzq").append("\n");
        head.append(" * @email:[email protected]").append("\n");
        head.append(" * @date:").append(dateFormat.format(new Date())).append("\n");
        head.append(" * @url:http://ac.jobdu.com/problem.php?pid=")
                .append(probId).append("\n");
        head.append(" */").append("\n");
        return head.toString();
    }

    /** Closes a resource, ignoring a null argument and close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when close itself fails.
        }
    }

    public static void main(String[] args) {
        String path = "/Users/wangzhenqing/git_work/java/test";
        String newPath = "/Users/wangzhenqing/git_work/java/new";
        getDirectoryFiles(path, newPath);
    }
}
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# author: wangzhenqing
# date: 2015-06-30 13:47:19
import requests
from BeautifulSoup import BeautifulSoup as soup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def get_cdsn_url():
    """Collect ``title||url`` entries from the CSDN article list pages.

    Requests list pages 1 to 19 of the blog and reads every
    ``<span class="link_title">`` element on them.

    Returns:
        A list of strings of the form ``<title>||<absolute url>`` in the
        order they were found, without duplicates.
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/43.0.2357.130 Safari/537.36'
    }
    common_url = 'http://blog.csdn.net'
    urls = []
    seen = set()  # constant-time duplicate check; ``urls`` keeps the order
    # One session serves all pages instead of a new one per page.
    s = requests.session()
    for x in range(1, 20):
        print(x)
        cur_url = common_url + '/u013027996/article/list/' + str(x)
        r = s.get(cur_url, headers=headers)
        r.encoding = 'utf-8'
        # convertEntities turns HTML entities (e.g. an escaped "<") in
        # titles back into plain characters.
        html = soup(r.text, convertEntities=soup.HTML_ENTITIES)
        for link_title in html.findAll('span', {"class": "link_title"}):
            anchor = link_title.find('a')
            href = anchor.get('href') if anchor is not None else None
            # A title without a usable link cannot produce an entry.
            if not href:
                continue
            entry = link_title.text + '||' + common_url + href
            print(entry)
            if entry in seen:
                continue
            seen.add(entry)
            urls.append(entry)
    return urls
def write_file(urls):
    """Write every entry of ``urls`` on its own line to ``url.txt``.

    Args:
        urls: Iterable of strings; each is written followed by a newline.
            Any existing ``url.txt`` in the working directory is
            overwritten.
    """
    # The with-block closes the file even when a write raises.
    with open('url.txt', 'w') as file_object:
        file_object.writelines(url + '\n' for url in urls)
# Script entry point: collect the article list, save it, report completion.
if __name__ == '__main__':
    write_file(get_cdsn_url())
    print('over')
/**
* @author:wangzq
* @email:[email protected]
* @date:2015-06-30 11:01:54
 * @description: Adds a link to the matching CSDN solution article to the header comment of each scraped source file
*/
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Copies the source files of one directory into another and, for files
 * whose name matches a known article title, adds a line linking to the
 * corresponding CSDN article to the header comment.
 */
public class JavaCodeUrlUtils {
    /** Charset name used for both reading and writing files. */
    private static final String ENCODING = "utf-8";

    /** File-name suffix removed before looking a file up by title. */
    private static final String JAVA_SUFFIX = ".java";

    /** Article title (without the site marker) mapped to the article URL. */
    private static Map<String, String> articleMap =
            new HashMap<String, String>();

    /**
     * Reads a file as UTF-8 text and, when the file name (without its
     * {@code .java} suffix) is a known article title, inserts a line with
     * the article URL after the line containing the problem URL.
     *
     * @param file     file to read
     * @param fileName name of the file, used as the lookup key
     * @return the file content with every line terminated by {@code "\n"},
     *         plus the inserted link line where applicable; an empty
     *         string when {@code file} is not a regular file
     */
    public static String getFileCode(File file, String fileName) {
        // Names without the expected suffix are used as-is instead of
        // being cut with an out-of-range substring.
        if (fileName.endsWith(JAVA_SUFFIX)) {
            fileName = fileName.substring(
                    0, fileName.length() - JAVA_SUFFIX.length());
        }
        System.out.println(fileName);
        StringBuilder code = new StringBuilder();
        FileInputStream in = null;
        try {
            if (file.isFile() && file.exists()) {
                in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, ENCODING));
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    code.append(lineTxt).append("\n");
                    if (lineTxt.contains("@url:http://ac.jobdu.com/problem.php?pid")
                            && articleMap.containsKey(fileName)) {
                        System.out.println(fileName);
                        code.append(" * 解题思路参考csdn:")
                                .append(articleMap.get(fileName)).append("\n");
                    }
                }
            } else {
                System.err.println("找不到指定的文件");
            }
        } catch (Exception e) {
            System.err.println("读取文件内容出错");
            e.printStackTrace();
        } finally {
            // Release the file handle on the error path as well.
            closeQuietly(in);
        }
        return code.toString();
    }

    /**
     * Processes every regular file directly inside {@code path} with
     * {@link #getFileCode(File, String)} and writes the result under the
     * same file name into {@code newPath}.
     *
     * @param path    directory containing the input files
     * @param newPath directory the processed files are written to
     */
    public static void getDirectoryFiles(String path, String newPath) {
        File dirFile = new File(path);
        File[] files = dirFile.isDirectory() ? dirFile.listFiles() : null;
        if (files == null) {
            // Not a directory, or it could not be listed: stop here
            // instead of iterating over null.
            System.err.println(path + "不是文件夹,请检查!");
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue; // sub-directories hold no source to process
            }
            String fileName = file.getName();
            String code = getFileCode(file, fileName);
            if ("".equals(code)) {
                System.out.println(fileName);
            }
            writeCodeToFile(newPath + File.separator + fileName, code);
        }
    }

    /**
     * Writes {@code code}, followed by a line separator, to a file as
     * UTF-8 text, replacing any existing content.
     *
     * @param filePath path of the file to write
     * @param code     text to write
     */
    public static void writeCodeToFile(String filePath, String code) {
        PrintStream ps = null;
        try {
            // Write with the same charset the files are read with, rather
            // than the platform default.
            ps = new PrintStream(new File(filePath), ENCODING);
            ps.println(code);
            // PrintStream records write failures instead of throwing.
            if (ps.checkError()) {
                System.err.println("写文件内容出错");
            }
        } catch (IOException e) {
            System.err.println("写文件内容出错");
            e.printStackTrace();
        } finally {
            if (ps != null) {
                ps.close();
            }
        }
    }

    /**
     * Loads the article list into the title-to-URL map.
     *
     * <p>Each line is expected to be {@code <title>||<url>}. A title
     * containing {@code &&} is split at that separator and every part
     * that does not mention "LeetCode" is registered; any other title is
     * registered only when it carries the site marker. The marker is
     * removed from the stored key. Lines without the {@code ||} separator
     * are skipped.
     *
     * @param filename path of the UTF-8 article list file
     */
    public static void getCSDNArticles(String filename) {
        File file = new File(filename);
        FileInputStream in = null;
        try {
            if (file.isFile() && file.exists()) {
                in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, ENCODING));
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    String[] parts = lineTxt.trim().split(Pattern.quote("||"));
                    // A blank or malformed line must not abort the whole
                    // load with an ArrayIndexOutOfBoundsException.
                    if (parts.length < 2) {
                        continue;
                    }
                    String title = parts[0];
                    String url = parts[1].trim();
                    if (title.contains("&&")) {
                        for (String part : title.split("&&")) {
                            if (!part.contains("LeetCode")) {
                                articleMap.put(
                                        part.replace("【九度】", "").trim(), url);
                            }
                        }
                    } else if (title.contains("【九度】")) {
                        articleMap.put(title.replace("【九度】", "").trim(), url);
                    }
                }
            } else {
                System.err.println("找不到指定的文件");
            }
        } catch (Exception e) {
            System.err.println("读取文件内容出错");
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /** Closes a resource, ignoring a null argument and close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when close itself fails.
        }
    }

    public static void main(String[] args) {
        String path = "/Users/wangzhenqing/git_work/java/test";
        String newPath = "/Users/wangzhenqing/git_work/java/new";
        getCSDNArticles("/Users/wangzhenqing/git_work/java/1.txt");
        // Show what was loaded before rewriting any file.
        System.out.println(articleMap.size());
        for (Map.Entry<String, String> entry : articleMap.entrySet()) {
            System.out.println(entry.getKey());
            System.out.println(entry.getValue());
        }
        getDirectoryFiles(path, newPath);
    }
}