# The code is as follows:
# -*- coding:utf-8 -*-
'''Crawl the personal information of HIT teachers.
author : wud
date : 2017/12/4
version : 1.0
'''
# encoding:utf-8
import urllib2
import random
import MySQLdb
import requests
import re
from time import sleep
def main(max_urls=1):
    '''Crawl teacher pages listed in url.txt and save the extracted fields.

    Reads one URL per line from "url.txt", downloads each page with
    getContent(), extracts name, zhicheng, xueyuan, email, telephone,
    address and picture from the HTML with regular expressions, appends the
    record to "hitteacherspersonalinfromation.txt" and inserts it into the
    PersonalInfo_1 MySQL table.

    URLs whose record has an empty xueyuan or zhicheng are appended to
    "hitteacherspersonalinfromation_diff.txt" instead of being inserted.
    URLs that raise during download, parsing or the database write are
    appended to "fail.txt" and the crawl continues with the next URL.

    max_urls -- maximum number of URLs to process. Defaults to 1, which is
                what the original hard-coded loop bound handled. Reading
                also stops at the end of url.txt.
    '''
    # Loop-invariant data is built once, before the crawl starts.
    my_headers = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) 。。。 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]

    # NOTE(review): these patterns look like they lost their HTML delimiters
    # when the script was copied (a trailing lazy group such as `(.*?)` at
    # the very end of a pattern can only match the empty string, and the
    # picture pattern has no capture group). The e-mail pattern was an
    # unterminated literal and has been repaired to mirror the phone and
    # address patterns. Restore all of them against the real page markup;
    # until then zhicheng is always empty and every URL lands in the
    # "_diff" file.
    name_keyword = re.compile(r'''(.*?)''', re.U|re.S)
    telphone_keyword = re.compile(r'''电话(.*?)''', re.U|re.S)
    email_keyword = re.compile(r'''邮箱(.*?)''', re.U|re.S)
    address_keyword = re.compile(r'''地址(.*?)''', re.U|re.S)
    picture_keyword = re.compile(r'''style="cursor:default;">''', re.U|re.S)
    zhicheng_keyword = re.compile(r''' (.*?)''', re.U|re.S)
    xueyuan_keyword = re.compile(r'''目前就职(.*?) ''', re.U|re.S)

    db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
    cursor = db.cursor()
    try:
        with open("url.txt", 'r') as f:
            flag = 1
            while flag <= max_urls:
                line = f.readline()
                if not line:
                    # End of the URL list.
                    break
                # Strip only the line terminator, so a final line without
                # a trailing newline keeps its last character.
                url = line.rstrip("\r\n")
                print(flag)
                print(url)
                try:
                    r = getContent(url, my_headers)
                    print(r)

                    name = ''.join(name_keyword.findall(r))
                    telphone = ''.join(telphone_keyword.findall(r))
                    print(telphone)
                    email = ''.join(email_keyword.findall(r))
                    print(email)
                    address = ''.join(address_keyword.findall(r))
                    print(address)
                    picture = ''.join(picture_keyword.findall(r))
                    # The match is assumed to be a quoted site-relative
                    # path; the surrounding characters are dropped.
                    # NOTE(review): confirm once the pattern is restored.
                    picture = "http://homepage.hit.edu.cn" + picture[1:-1]
                    zhicheng = ''.join(zhicheng_keyword.findall(r))
                    xueyuan = ''.join(xueyuan_keyword.findall(r))

                    # The empty second item keeps the double space that the
                    # original record format has after the name.
                    information = " ".join([name, "", zhicheng, xueyuan,
                                            email, telphone, address, picture])
                    with open("hitteacherspersonalinfromation.txt", 'a+') as f6:
                        f6.write(information + "\n")
                    print(information)

                    # ''.join() never returns None, so test for empty
                    # strings to detect a page where the fields were not
                    # found.
                    if xueyuan and zhicheng:
                        cursor.execute('INSERT INTO PersonalInfo_1(name,zhicheng,xueyuan,picture,tel,email,address)VALUES (%s,%s,%s,%s,%s,%s,%s)',(name, zhicheng, xueyuan, picture,telphone,email,address))
                        db.commit()
                        print("SAVE IT!")
                    else:
                        print("SAVE FAIL")
                        with open("hitteacherspersonalinfromation_diff.txt", 'a+') as f0:
                            f0.write(url + "\n")
                except Exception as exc:
                    # Per-URL boundary: record the failure and keep going.
                    # Append mode creates fail.txt when missing and never
                    # overwrites earlier entries.
                    with open("fail.txt", 'a') as f7:
                        f7.write(url + "\n")
                    print("requests error")
                    print(exc)
                # Count every attempted URL exactly once.
                flag += 1
    finally:
        cursor.close()
        db.close()
    print("END")
def getContent(url, headers):
    '''Download url and return the raw response body.

    url     -- address to fetch.
    headers -- sequence of User-Agent strings; one is chosen at random for
               this request. If the sequence is empty, urllib2's default
               User-Agent is used.

    Errors raised by urllib2.urlopen propagate to the caller.
    '''
    req = urllib2.Request(url)
    if headers:
        req.add_header("User-Agent", random.choice(headers))
    # No explicit "Host" header: urllib2 derives it from the URL. The
    # previous hard-coded "blog.csdn.net" did not match the requested site.
    # The former "GET" header was dropped as well, since the request method
    # is not an HTTP header field.
    # NOTE(review): the Referer below also points at an unrelated site;
    # kept as-is, confirm whether the target server needs it.
    req.add_header("Referer", "http://www.csdn.net/")
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
# Run the crawler only when this file is executed as a script.
if '__main__' == __name__:
    main()