python 爬虫(二)

抓取 hdoj 上对应用户的昵称和做题数目,以及 poj 上对应用户的做题数目。使用的系统为 Ubuntu Kylin 14.04,Python 版本为 2.7.x。

# -*- coding:utf-8 -*-
# Module metadata: the conventional dunder name is ``__author__``
# (the original spelling ``_author__`` was missing one leading underscore).
__author__ = 'fei'

import urllib
import urllib2
import re
import time

class hdoj(object):
    """Scraper for one user's status page on HDOJ (acm.hdu.edu.cn).

    Typical use: create an instance with the user ID, call ``getPage()``
    once to download the page, then call ``getNum()`` / ``getName()`` to
    extract values from the downloaded HTML.
    """

    # Patterns are matched against the raw HTML returned by the server.
    _NUM_REGEX = r'<tr><td>Problems Solved</td><td align=center>(.*?)</td></tr>'
    _NAME_REGEX = r'<h1 style="color:#1A5CC8" align=center>(.*?)</h1>'

    def __init__(self, ID):
        """Store the user ID and prepare the request headers.

        :param ID: HDOJ user ID; converted with ``str()`` when the URL is built.
        """
        self.user_ID = ID
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.nums = None     # list of "Problems Solved" matches, set by getNum()
        self.name = None     # list of <h1> nickname matches, set by getName()
        self.content = None  # raw page body, set by a successful getPage()

    def getPage(self):
        """Download the user's status page into ``self.content``.

        A ``urllib2.URLError`` is reported on stdout instead of being
        raised; in that case ``self.content`` keeps its previous value.
        """
        url = 'http://acm.hdu.edu.cn/userstatus.php?user=' + str(self.user_ID)
        request = urllib2.Request(url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                self.content = response.read()
            finally:
                # Release the connection even if read() fails part-way.
                response.close()
        except urllib2.URLError as e:
            # Fall back to the exception itself so that an error without a
            # ``reason`` attribute is still reported instead of ignored.
            print('some errors have happened in connecting with the hdoj ! %s'
                  % (getattr(e, 'reason', e),))

    def _findAll(self, regex):
        """Return every capture of *regex* in the page, or [] if no page is loaded."""
        if self.content is None:
            return []
        return re.findall(re.compile(regex, re.S), self.content)

    def getNum(self):
        """Return the "Problems Solved" value found in the downloaded page.

        Every match is kept in ``self.nums``.

        :returns: the first captured text, or ``None`` when no page has
                  been downloaded or the pattern is not present.
        """
        self.nums = self._findAll(self._NUM_REGEX)
        return self.nums[0] if self.nums else None

    def getName(self):
        """Return the nickname shown in the page's ``<h1>`` heading.

        Every match is kept in ``self.name``.

        :returns: the first captured text, or ``None`` when no page has
                  been downloaded or the pattern is not present.
        """
        self.name = self._findAll(self._NAME_REGEX)
        return self.name[0] if self.name else None
# Demo run: build a scraper for user ID '461807914', download the page,
# then print what getNum() and getName() return.
spider = hdoj('461807914')
spider.getPage()
print(spider.getNum())
print(spider.getName())

这里要注意正则表达式的写法:应先把读取到的网页内容(也就是 response.read() 的返回值)输出到终端,对照终端里的输出去写正则表达式,而不是对照浏览器“审查元素”中显示的代码去写,这样就能避免字符编码造成的麻烦。比如其中html语句中标签里的括号并不会在终端中显示,这时应该以终端里显示出的内容为准。

# -*- coding:utf-8 -*-
# Module metadata: the conventional dunder name is ``__author__``
# (the original spelling ``_author__`` was missing one leading underscore).
__author__ = 'fei'

import urllib
import urllib2
import re
import time

class poj(object):
    """Scraper for one user's status page on POJ (poj.org).

    Typical use: create an instance with the user ID, call ``getPage()``
    once to download the page, then call ``getNum()`` to extract the
    "Solved:" value from the downloaded HTML.
    """

    def __init__(self, ID):
        """Store the user ID and prepare the request headers.

        :param ID: POJ user ID; converted with ``str()`` wherever it is used.
        """
        self.user_ID = ID
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.nums = None     # list of "Solved:" matches, set by getNum()
        self.content = None  # raw page body, set by a successful getPage()

    def getPage(self):
        """Download the user's status page into ``self.content``.

        A ``urllib2.URLError`` is reported on stdout instead of being
        raised; in that case ``self.content`` keeps its previous value.
        """
        url = 'http://poj.org/userstatus?user_id=' + str(self.user_ID)
        request = urllib2.Request(url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                self.content = response.read()
            finally:
                # Release the connection even if read() fails part-way.
                response.close()
        except urllib2.URLError as e:
            # Fall back to the exception itself so that an error without a
            # ``reason`` attribute is still reported instead of ignored.
            print('some errors have happened in connecting with the poj ! %s'
                  % (getattr(e, 'reason', e),))

    def getNum(self):
        """Return the "Solved:" value found in the downloaded page.

        The pattern looks for the ``Solved:`` cell followed by a cell whose
        markup ends in ``id=<user ID>>`` and captures the text after it.
        Every match is kept in ``self.nums``.

        :returns: the first captured text, or ``None`` when no page has
                  been downloaded or the pattern is not present.
        """
        if self.content is None:
            self.nums = []
            return None
        # re.escape keeps regex metacharacters in the ID (e.g. '.') literal.
        # The gap between the two cells may hold any whitespace (or dots),
        # not only CR/LF, so the class is widened to \s.
        reg = (r'<td .*?>Solved:</td>[\s.]*<td .*?id='
               + re.escape(str(self.user_ID))
               + r'>(.*?)<')
        self.nums = re.findall(re.compile(reg, re.S), self.content)
        return self.nums[0] if self.nums else None

# Demo run: build a scraper for user ID '461807914', download the page,
# then print what getNum() returns.
spider = poj('461807914')
spider.getPage()
print(spider.getNum())

poj的这个正则真是恶心死我了。两段html代码之间的空白(换行、空格等)如果无法确定,可以插入一个 [.\s\n\r]* 来连接,它表示任意多个句点、空白字符、换行或回车。注意方括号内的 . 只匹配字面上的句点,并不表示任意字符;而 \s 本身已经包含空格、\n 和 \r,所以直接写成 \s* 也可以。

你可能感兴趣的:(python,爬虫,对象)