今天为了抓取杭电上的ACM题目,于是各种查资料,用Python做成了一个自动抓取相应网页信息的程序。
主要用到了Python的urllib模块和re模块。基本思路:先抓取网页的全部源代码,然后再用正则提取相应的内容。
1: # -*- coding: utf-8 -*-
2:
3: import re
4: import urllib
5: import MySQLdb
6:
7:
def _extract_sections(html):
    """Return the five text sections of a problem page, or ``None``.

    The sections are, in order: description, input, output, sample input
    and sample output.  ``None`` is returned when the page has no
    ``Problem Description ... Author`` span, or when that span does not
    split into at least five ``;``-separated pieces.
    """
    match = re.search(r'Problem Description.*Author', html, re.DOTALL)
    if match is None:
        return None

    # Drop the HTML tags, then the "&nbsp" part of every "&nbsp;" entity.
    # The ";" that is left behind is what separates the sections below.
    # NOTE(review): the published listing showed a plain space here, which
    # would also have removed the spaces inside the section labels and made
    # the label stripping below a no-op; "&nbsp" is the only reading that is
    # consistent with splitting on ";" -- confirm against a live page.
    text = re.sub(r'<[^>]*>', '', match.group(0)).replace('&nbsp', '')
    parts = text.split(';')
    if len(parts) < 5:
        return None

    labels = ('Problem Description', 'Input', 'Output',
              'Sample Input', 'Sample Output')
    return [part.replace(label, '') for part, label in zip(parts, labels)]


def down(i):
    """Download one HDU problem page and insert its sections into MySQL.

    The page ``showproblem.php?pid=<i>`` is fetched, split into its five
    sections and stored as one row of the ``acm`` table together with the
    problem id.  Nothing is written (and ``Not found.`` is printed) when
    the sections cannot be located in the page.

    Args:
        i: Problem id; it is converted with ``str()`` for both the URL and
            the ``HDid`` column.

    Raises:
        MySQLdb.Error: If the INSERT or the commit fails.  A failure to
            *connect* is only reported on stdout and the problem skipped.
    """
    try:  # Python 2, which this script was written for
        from urllib import urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen

    hdid = str(i)
    url = 'http://acm.hdu.edu.cn/showproblem.php?pid=' + hdid

    page = urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()
    if not isinstance(html, str):
        # Python 3 returns bytes.  NOTE(review): gb2312 is assumed because
        # that is the charset the database connection below uses -- confirm.
        html = html.decode('gb2312', 'replace')

    fields = _extract_sections(html)
    if fields is None:
        print('Not found.')
        return

    for field in fields:
        print(field)

    # Connect only once there is something to store, so pages that are
    # skipped never open (and leak) a connection.
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    try:
        conn = MySQLdb.connect(host='localhost', user='root',
                               passwd='6191080', db='gglg', port=3306,
                               charset='gb2312')
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        return

    try:
        cur = conn.cursor()
        try:
            # Parameterized so that quotes in the scraped text can neither
            # break the statement nor inject SQL.
            cur.execute(
                "INSERT INTO acm "
                "(Description,Input,Output,samplein,sampleout,HDid) "
                "VALUES (%s,%s,%s,%s,%s,%s)",
                fields + [hdid])
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
50:
51:
def main(start=0, end=4000):
    """Crawl every problem id in ``range(start, end)``.

    Args:
        start: First problem id to fetch (inclusive).
        end: Problem id to stop before (exclusive).
    """
    for pid in range(start, end):
        down(pid)
# Start the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
fff