爬取大学公告信息:BeautifulSoup 的使用

# -*-coding:utf-8-*-
import re
import urllib2

from bs4 import BeautifulSoup


def print_zh(key):
    r"""Print ``key`` with backslash escapes such as ``\uXXXX`` decoded.

    The scraped titles passed in by this script contain literal
    ``\uXXXX`` sequences instead of the Chinese characters themselves;
    this helper turns them back into readable text before printing.

    The previous implementation built a ``u'...'`` literal and ran it
    through ``eval``. That raised ``SyntaxError`` for any text holding an
    unescaped quote or newline, and executed scraped (untrusted) input
    as code. Decoding through the ``unicode_escape`` codec yields the same
    output for well-formed input without evaluating anything.

    Args:
        key: Text (or a byte string) that may contain backslash escapes.

    Raises:
        UnicodeDecodeError: If ``key`` holds a malformed escape sequence
            (for example a lone trailing backslash).
    """
    if isinstance(key, bytes):
        raw = key
    else:
        # Text input: convert real non-ASCII characters into escapes first
        # so they survive the byte-level decode below; existing backslash
        # sequences are left untouched by this codec.
        raw = key.encode('raw_unicode_escape')
    print(raw.decode('unicode_escape'))

# Script body: download one page of the university bulletin list and print
# the title, link(s), department and date of every announcement whose title
# contains one of the keywords below.

# Keywords to look for in announcement titles.
keyList = [u'项目', u'交流']
# Never populated by this script; kept so the module-level name still exists.
keyResult = []
url = 'http://urp.tust.edu.cn/bulletinPageList.jsp?pageNum=1&groupIds=Nyw4'
req = urllib2.Request(url)
res = urllib2.urlopen(req)
try:
    soup = BeautifulSoup(res.read(), "lxml")
finally:
    # Release the HTTP connection once the page has been read and parsed.
    res.close()

# The title is extracted from str(<result list>) below, and that text is
# expected to carry non-ASCII characters as literal \uXXXX escapes (which is
# why print_zh() is needed to display it) -- NOTE(review): confirm against
# the installed bs4 / Python 2 combination. The keywords are therefore
# compared in the same escaped form. Encoding once here replaces the old
# repr()/repr()/slice trick that was recomputed for every list item.
escapedKeys = [key.encode('unicode_escape') for key in keyList]

lists = soup.select('li.an-list')
for li in lists:
    title_divs = li.select('div[class="an-title block"]')
    if not title_divs:
        continue

    title_match = re.search(r'title="(.*)"', str(title_divs))
    if title_match is None:
        # Entry without a title attribute: skip it rather than crash with
        # an IndexError on an empty findall() result.
        continue
    te = title_match.group(1)

    # A plain substring test is enough (and avoids treating the keyword as
    # a regular expression); one matching keyword is sufficient.
    if not any(escaped in te for escaped in escapedKeys):
        continue

    # Print the (decoded) Chinese title.
    print_zh(te)
    dept_divs = li.select('div[class="dep-angency block"]')
    herf = re.findall(r'href="(.*)"\s', str(dept_divs))
    # Print the matching link(s); this is the raw list returned by findall.
    print(herf)
    depart = dept_divs[0].select('a.deptlink')[0].get_text()
    # Print the publishing department.
    print(depart)
    date_d = li.select("p")
    # Print the date (text of the first <p> in the entry).
    print(date_d[0].get_text())
    print('\n')

参考 http://www.mamicode.com/info-detail-1377315.html

你可能感兴趣的:(python,爬虫)