Official Chinese documentation:
1. requests 1.1.0 Quickstart
2. requests 1.1.0 Advanced Usage
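The Quickstart chapter covers the basic request/response cycle. A minimal sketch (the httpbin.org URLs are just placeholders for any target site):

import requests

# Plain GET request; params are appended to the query string
resp = requests.get('http://httpbin.org/get', params={'q': 'python'})
print(resp.status_code)   # 200 on success
print(resp.encoding)      # encoding guessed from the response headers
print(resp.text[:200])    # decoded body as text

# POST form data and read the response back as JSON
resp = requests.post('http://httpbin.org/post', data={'name': 'value'})
print(resp.json()['form'])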
Official Chinese documentation:
BeautifulSoup
An example of using BeautifulSoup: parse a saved Coursera course page, collect each week's lecture links, and write them out as course.xml.
import re
from bs4 import BeautifulSoup

def regUrls():
    # Parse the saved Coursera course page and collect, for every week,
    # each lecture's name together with its related resource links.
    html_file = open(r'E:\WORK_FILE\Python\Python2\userful\coursera\coursera.html').read()
    soup = BeautifulSoup(html_file, 'lxml')
    course_item_header_div_tags = soup.find_all('div', class_='course-item-list-header')
    result = {}
    count = 0
    for div_tag in course_item_header_div_tags:
        count += 1
        week_course_name = str(count) + '-week-' + div_tag.h3.contents[1][2:].encode('utf8')
        result[week_course_name] = {}
        # The sibling <ul> right after the header holds this week's lecture entries
        ul_tag = div_tag.next_sibling
        li_tags = ul_tag.find_all('li')
        for li_tag in li_tags:
            lecture_name = li_tag.a.string.encode('utf8')
            # Keep only the readable part of the lecture title
            lecture_name = re.search(r'\b[a-zA-Z ]+\b', lecture_name).group(0)
            lecture_view_link = li_tag.a.get('data-modal-iframe').encode('utf8')
            result[week_course_name][lecture_name] = []
            result[week_course_name][lecture_name].append(lecture_view_link)
            # Collect the resource links, skipping the raw mp4 download
            for a_tag in li_tag.div.find_all('a'):
                href = a_tag.get('href').encode('utf8')
                if 'download.mp4' not in href:
                    result[week_course_name][lecture_name].append(href)
    return result
def genXml(res_dic):
    # Serialize the nested dict to an XML string and save it as course.xml.
    # The element names (<course>, <week>, <lecture>, <name>, <url>) mirror the nesting of res_dic.
    week_indent = 4
    week_name_indent = 4 * 2
    lecture_indent = 4 * 2
    lecture_name_indent = 4 * 3
    url_indent = 4 * 3
    xml = ''
    xml += '<course>'
    week_name_keys = res_dic.keys()
    # Sort weeks by the numeric prefix before the first '-'
    week_name_keys.sort(key=lambda item: int(item[:item.find('-')]))
    for week_name in week_name_keys:
        xml += '\n' + ' ' * week_indent + '<week>'
        xml += '\n' + ' ' * week_name_indent + '<name>%s</name>' % week_name
        lectures = res_dic[week_name]
        lecture_name_keys = lectures.keys()
        lecture_name_keys.sort()
        for lecture_name in lecture_name_keys:
            xml += '\n' + ' ' * lecture_indent + '<lecture>'
            xml += '\n' + ' ' * lecture_name_indent + '<name>%s</name>' % lecture_name
            urls = lectures[lecture_name]
            for url in urls:
                xml += '\n' + ' ' * url_indent + '<url>%s</url>' % url
            xml += '\n' + ' ' * lecture_indent + '</lecture>'
        xml += '\n' + ' ' * week_indent + '</week>'
    xml += '\n</course>'
    with open('course.xml', 'w') as fd:
        fd.write(xml)
    return xml
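Putting the two functions together, the script can be driven like this (a sketch; the HTML path inside regUrls() is hard-coded above):

if __name__ == '__main__':
    # Parse the saved page into {week: {lecture: [urls]}}
    courses = regUrls()
    # Write course.xml and get the same XML back as a string
    xml = genXml(courses)
    print(xml[:200])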
Search
BeautifulSoup provides a rich set of search methods: find()/find_all(), find_parent()/find_parents(), find_next_sibling()/find_next_siblings(), find_previous_sibling()/find_previous_siblings(), and select() for CSS selectors.
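The filters accepted by find()/find_all() can be tag names, attribute values, regular expressions, or functions; a small sketch against an inline snippet:

import re
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="c"><a href="/a">A</a><a href="/b" id="x">B</a></div>', 'lxml')

print(soup.find_all('a'))                              # every <a> tag
print(soup.find('a', id='x'))                          # first match by attribute
print(soup.find_all('a', href=re.compile('^/a')))      # attribute matched by regex
print(soup.find_all(lambda tag: tag.has_attr('id')))   # filter function
print(soup.select('div.c > a'))                        # CSS selector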
Navigating the document tree
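Navigation relies on the .parent, .children, .next_sibling and related attributes, which is also how regUrls() above walks from each header <div> to the <ul> that follows it; a sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<ul><li>one</li><li>two</li></ul>', 'lxml')
li = soup.li                         # first <li>

print(li.parent.name)                # 'ul'  -- go up
print(li.next_sibling)               # <li>two</li>  -- go sideways
print(list(soup.ul.children))        # direct children of the <ul>
print([s for s in soup.ul.strings])  # all text nodes under the <ul>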
Modifying the document tree
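The tree can also be edited in place: create tags with new_tag(), attach them with append() or insert(), change attributes and text directly, and delete nodes with decompose(); a sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><span>old</span></div>', 'lxml')

# create and attach a new tag
new_a = soup.new_tag('a', href='http://example.com')
new_a.string = 'link'
soup.div.append(new_a)

# edit an attribute and replace a node's text
soup.div.a['class'] = 'external'
soup.span.string = 'new'

# remove a node entirely
soup.div.a.decompose()

print(soup.div)   # <div><span>new</span></div>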
Encoding detection module
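BeautifulSoup ships UnicodeDammit, which guesses the encoding of raw bytes and converts them to Unicode; a sketch:

from bs4 import UnicodeDammit

raw = b'\xe4\xbd\xa0\xe5\xa5\xbd'    # UTF-8 bytes for a Chinese greeting
dammit = UnicodeDammit(raw)
print(dammit.original_encoding)      # e.g. 'utf-8'
print(dammit.unicode_markup)         # the decoded unicode text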