Python web crawler

Crawling the personal information of teachers at a university

Please use this legitimately and responsibly; otherwise, please close this blog.

The code is as follows:

# -*- coding:utf-8 -*-
'''Crawl HIT teachers' personal information
    author      :       wud
    date        :       2017/12/4
    version     :       1.0
'''
import urllib2
import random
import MySQLdb
import requests
import re
from time import sleep

def main():
    # the database host, user, password and db name are masked in the original post
    db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
    cursor = db.cursor()
    # url.txt holds one teacher homepage URL per line
    f = open("url.txt", 'r')
    flag = 1
    # flag counts processed URLs; raise this bound to crawl more than one page
    while flag <= 1:
        url = f.readline()[:-1]  # drop the trailing newline
        print flag
        print url
        try:
            my_headers = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) 。。。 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]
            r = getContent(url, my_headers)
            print r

            # NOTE: the HTML tags inside the patterns below were stripped when the
            # post was rendered; only the keyword text and capture groups survive
            name_keyword = re.compile(r'''(.*?)''', re.U|re.S)
            name = ''.join(name_keyword.findall(r))

            telphone_keyword = re.compile(r'''电话(.*?)''', re.U|re.S)
            telphone = ''.join(telphone_keyword.findall(r))
            print telphone

            email_keyword = re.compile(r'''邮箱(.*?)''', re.U|re.S)
            email = ''.join(email_keyword.findall(r))
            print email

            #print name
            address_keyword = re.compile(r'''地址(.*?)''', re.U|re.S)
            address = ''.join(address_keyword.findall(r))
            print address

            # a capture group is assumed here; whatever attribute pattern preceded
            # style="cursor:default;" was lost in rendering
            picture_keyword = re.compile(r'''(.*?)style="cursor:default;">''', re.U|re.S)
            picture = ''.join(picture_keyword.findall(r))
            #print type(picture)
            # [1:-1] drops the first and last character of the captured value
            # (presumably surrounding quotes) before prefixing the site domain
            picture = "http://homepage.hit.edu.cn" + picture[1:-1]

            #print picture
            zhicheng_keyword = re.compile(r'''
(.*?)''', re.U|re.S)
            zhicheng = ''.join(zhicheng_keyword.findall(r))
            #print zhicheng

            #major_keyword = re.compile(r'''(.*?)''', re.U|re.S)
            #major = major_keyword.findall(r)
            #print major

            xueyuan_keyword = re.compile(r'''目前就职(.*?)''', re.U|re.S)
            xueyuan = ''.join(xueyuan_keyword.findall(r))
            #print xueyuan

            information = str(name) + " " + " " + str(zhicheng) + " " + str(xueyuan) + " " + email + " " + telphone + " " + address + " " + str(picture)

            f0 = open("hitteacherspersonalinfromation_diff.txt", 'a+')
            #f2 = open("hitteacherspersonalinfromation_picture.txt", 'r+')
            f6 = open("hitteacherspersonalinfromation.txt", 'a+')
            #print >> f0, name
            #print >> f2, picture
            print >> f6, information
            print information
            flag += 1
            #f0.close()
            #f2.close()
            f6.close()

            # the joined strings are empty when nothing matched, so only save
            # records where both fields were actually extracted
            if xueyuan and zhicheng:
                cursor.execute('INSERT INTO PersonalInfo_1(name,zhicheng,xueyuan,picture,tel,email,address) VALUES (%s,%s,%s,%s,%s,%s,%s)',
                               (name, zhicheng, xueyuan, picture, telphone, email, address))
                db.commit()
                print "SAVE IT!"
            else:
                print "SAVE FAIL"
                print >> f0, url
            f0.close()
        except Exception:
            flag += 1
            # log the URLs that failed to download or parse
            f7 = open("fail.txt", 'a+')
            print >> f7, url
            f7.close()
            print "requests error"
    f.close()
    print "END"


def getContent(url, headers):
    # send a random User-Agent from the list; urllib2 fills in the Host header
    # from the URL automatically
    random_header = random.choice(headers)
    req = urllib2.Request(url)
    req.add_header("User-Agent", random_header)
    content = urllib2.urlopen(req).read()
    return content


if __name__ == '__main__':
    main()
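Note that the HTML-tag parts of the regular expressions above were lost when the post was rendered, so the patterns as published will not match a real page on their own. Purely as a hypothetical illustration (the markup, field label and phone number below are invented and are not taken from the actual HIT homepages), a pattern of the same shape could be written like this:

# -*- coding:utf-8 -*-
# Hypothetical sketch: extracting a labelled field from an invented HTML fragment.
# The markup below does NOT reflect the real teacher homepage layout.
import re

html = u'''<tr><td class="label">电话</td><td>0451-86400000</td></tr>
<tr><td class="label">邮箱</td><td>someone@example.com</td></tr>'''

# same shape as the patterns above: label text, then a non-greedy capture
# of everything up to the closing tag
telphone_keyword = re.compile(u'电话</td><td>(.*?)</td>', re.U | re.S)
print(''.join(telphone_keyword.findall(html)))  # prints 0451-86400000

In practice each capture group has to be anchored on whatever tags actually surround the corresponding field on the target page.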

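The INSERT statement assumes a table named PersonalInfo_1 with seven columns. The schema itself is not shown in the post; a minimal sketch of a table that would accept that INSERT could look like the following, where only the column names come from the code above and the column types, lengths and connection parameters are placeholders and assumptions:

# -*- coding:utf-8 -*-
# Sketch of a table definition matching the INSERT used above.
# Only the column names come from the post; types and lengths are guesses,
# and the connection parameters are placeholders.
import MySQLdb

db = MySQLdb.connect("localhost", "user", "password", "testdb", charset="utf8")
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS PersonalInfo_1 (
        name     VARCHAR(100),
        zhicheng VARCHAR(100),
        xueyuan  VARCHAR(200),
        picture  VARCHAR(255),
        tel      VARCHAR(50),
        email    VARCHAR(100),
        address  VARCHAR(255)
    ) DEFAULT CHARSET=utf8
''')
db.commit()
db.close()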