# -*-coding:utf-8-*-
import re
import urllib2
from bs4 import BeautifulSoup
def print_zh(key):
    """Print *key* with its backslash escapes (e.g. ``\\uXXXX``) decoded.

    Args:
        key: Text that may contain Python-style escape sequences such as
            ``\\u9879\\u76ee``. Both byte strings and text strings are
            accepted.

    The decoded text is written to standard output. If *key* contains a
    malformed escape sequence, it is printed unchanged instead of raising.
    """
    # NOTE: this used to eval() a hand-built u'...' literal. The text comes
    # from scraped page content, so a title containing a quote either broke
    # the literal (SyntaxError) or could run arbitrary code. The
    # unicode_escape codec decodes the same escapes without executing
    # anything.
    try:
        if isinstance(key, bytes):
            text = key.decode('unicode_escape')
        else:
            # Re-escape characters that are already non-ASCII so they survive
            # the round trip through the byte-oriented codec.
            text = key.encode('ascii', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): show the raw text.
        text = key
    print(text)
# Keywords to look for in bulletin titles.
keyList = [u'项目', u'交流']
# Not used by any code visible in this file; kept so the module's names are unchanged.
keyResult = []
url = 'http://urp.tust.edu.cn/bulletinPageList.jsp?pageNum=1&groupIds=Nyw4'

# Download the bulletin list page; close the response even if reading fails.
req = urllib2.Request(url)
res = urllib2.urlopen(req)
try:
    html = res.read()
finally:
    res.close()
soup = BeautifulSoup(html, "lxml")
lists = soup.select('li.an-list')

# The titles are matched in their escaped form (non-ASCII characters written
# as \uXXXX), which is the form print_zh() later decodes. Encode each keyword
# the same way once, up front, instead of rebuilding it for every list item.
# NOTE(review): this relies on str() of the selected tags yielding \uXXXX
# escapes, as the original matching code assumed -- confirm for the bs4 /
# Python version in use.
escapedKeys = [key.encode('unicode-escape') for key in keyList]

for li in lists:
    lise = li.select('div[class="an-title block"]')
    if not lise:
        continue

    titles = re.findall(r'title="(.*)"', str(lise))
    if not titles:
        # No title attribute in this item's markup: nothing to match against.
        continue
    te = titles[0]

    # Report each item at most once, however many keywords it contains.
    if not any(escaped in te for escaped in escapedKeys):
        continue

    # Print the title with its escapes decoded.
    print_zh(te)

    lise2 = li.select('div[class="dep-angency block"]')
    herf = re.findall(r'href="(.*)"\s', str(lise2))
    # Print the matching link(s).
    print(herf)

    # Print the department; fall back to an empty line when the markup lacks it.
    deptLinks = lise2[0].select('a.deptlink') if lise2 else []
    depart = deptLinks[0].get_text() if deptLinks else u''
    print(depart)

    # Print the date; fall back to an empty line when there is no <p> element.
    date_d = li.select("p")
    print(date_d[0].get_text() if date_d else u'')
    print('\n')
# Reference: http://www.mamicode.com/info-detail-1377315.html