To understand the HTTP protocol, see these three articles by Ruan Yifeng (阮一峰):
An Introduction to the HTTP Protocol (HTTP协议入门)
An Introduction to Internet Protocols (互联网协议入门)
The SSL/TLS Protocol Explained with Diagrams (图解SSL/TLS协议)
HTTP request (request line + request headers + blank line + [optional message body])
HTTP response (status line + response headers + blank line + message body)
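For example, a minimal GET exchange looks roughly like this (www.example.com is only a placeholder host):

GET /index.html HTTP/1.1
Host: www.example.com
Accept: text/html

HTTP/1.1 200 OK
Content-Type: text/html
Content-Length: 12

Hello World!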
Postman: send RESTful-style requests to a backend and inspect the returned data
HTTPie: essentially a command-line version of Postman
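For instance, a couple of HTTPie commands (httpbin.org is just a public test service used here for illustration):

http GET https://httpbin.org/get name==hello      # name=hello sent as a query-string parameter
http POST https://httpbin.org/post name=hello     # name sent as a field in a JSON request body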
BuiltWith: identifies the technologies a website is built with
import builtwith
builtwith.parse('http://www.bootcss.com/')
import ssl
# Work around HTTPS certificate verification errors by using an unverified SSL context
ssl._create_default_https_context = ssl._create_unverified_context
builtwith.parse('https://www.jianshu.com/')
# python-whois: look up a domain's WHOIS registration info
import whois
whois.whois('www.baidu.com')
Regular expressions - good performance, but complex to write
pyquery - jQuery-style API - select elements with CSS selectors (illustrated in the sketch after the list below)
#foo - the element with id "foo"
.foo - elements with class "foo"
p - all p elements
p h2 - h2 elements that are descendants of a p
p>h2 - h2 elements that are direct children of a p
p~h2 - h2 siblings that follow a p (general sibling)
p+h2 - the h2 immediately following a p (adjacent sibling)
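A minimal pyquery sketch using the selectors above (the HTML string is made up for illustration):

from pyquery import PyQuery as pq

html = '<div id="foo"><p class="foo">Hello</p><div class="bar"><h2>Title</h2></div></div>'
doc = pq(html)
print(doc('#foo').text())        # the element with id "foo"
print(doc('.foo').text())        # elements with class "foo"
print(doc('div h2').text())      # h2 descendants of a div
print(doc('p + div h2').text())  # h2 inside the div right after a p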
BeautifulSoup - bs4
lxml - based on libxml2 - XPath; better performance than the two above
pip install lxml bs4
bs4 can use lxml as its parser, which gives better performance
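A quick sketch of what that looks like (the markup is a made-up example):

from bs4 import BeautifulSoup

html = '<html><body><h1>Hello</h1></body></html>'
soup = BeautifulSoup(html, 'lxml')           # lxml parser: faster, requires lxml to be installed
# soup = BeautifulSoup(html, 'html.parser')  # pure-Python fallback parser
print(soup.h1.text)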
import re
from urllib.error import URLError
from urllib.request import urlopen
import pymysql
def get_page_code(start_url, *, retry_times=3, charsets=('utf-8',)):  # seed URL, retry count, candidate charsets
    # The seed URL can be any page on the site you want to crawl
try:
        # Try each candidate charset until one decodes successfully
for charset in charsets:
try:
html = urlopen(start_url).read().decode(charset)
break
except UnicodeDecodeError:
html = None
except URLError as e:
print('Error:', e)
return get_page_code(start_url, retry_times=retry_times - 1, charsets=('utf-8', 'gbk', 'gb2312')) if retry_times > 0 else None
return html
def main():
url_list = ['http://sports.sohu.com/nba_a.shtml']
    # To avoid crawling the same page twice, add every visited page to a set;
    # a set deduplicates automatically
    visited_list = set()
while len(url_list) > 0:
current_url = url_list.pop(0)
visited_list.add(current_url)
html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
if html:
            # Find every match of the regex and capture the group in parentheses (the href value)
            link_regex = re.compile(r'<a[^>]+test=a\s[^>]*href=["\'](\S*)["\']', re.IGNORECASE)
link_list = re.findall(link_regex, html)
url_list += link_list
conn = pymysql.connect(host='localhost', port=3306,
db='crawler', user='root',
charset='utf8',
passwd='123456')
try:
for url in link_list:
if url not in visited_list:
visited_list.add(url)
print(url)
ht = get_page_code(url, charsets=('utf-8', 'gbk', 'gb2312'))
                        title_regex = re.compile(r'<title>(.*)</title>', re.IGNORECASE)
                        # title = re.findall(title_regex, ht)[0]
                        match_list = title_regex.findall(ht) if ht else []
if len(match_list) > 0:
title = match_list[0]
with conn.cursor() as cursor:
cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)', (title, url))
conn.commit()
finally:
conn.close()
    print('Done')
if __name__ == '__main__':
main()
# Refactored version of the crawler above
from urllib.error import URLError
from urllib.request import urlopen
import re
import pymysql
import ssl
from pymysql import Error
# Decode the page with the given charsets (not every site declares utf-8)
def decode_page(page_bytes, charsets=('utf-8',)):
page_html = None
for charset in charsets:
try:
page_html = page_bytes.decode(charset)
break
except UnicodeDecodeError:
pass
# logging.error('Decode:', error)
return page_html
# Fetch the page's HTML (retries a given number of times via recursion)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
page_html = None
try:
page_html = decode_page(urlopen(seed_url).read(), charsets)
except URLError:
# logging.error('URL:', error)
if retry_times > 0:
return get_page_html(seed_url, retry_times=retry_times - 1,
charsets=charsets)
return page_html
# Extract the needed parts from the page (usually links; specified via a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
pattern_regex = re.compile(pattern_str, pattern_ignore_case)
return pattern_regex.findall(page_html) if page_html else []
# Run the crawler and persist the specified data
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
conn = pymysql.connect(host='localhost', port=3306,
database='crawler', user='root',
password='123456', charset='utf8')
try:
with conn.cursor() as cursor:
url_list = [seed_url]
visited_url_list = {seed_url: 0}
while url_list:
current_url = url_list.pop(0)
depth = visited_url_list[current_url]
if depth != max_depth:
page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
links_list = get_matched_parts(page_html, match_pattern)
param_list = []
for link in links_list:
if link not in visited_url_list:
visited_url_list[link] = depth + 1
page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
                            headings = get_matched_parts(page_html, r'<h1>(.*?)</h1>')
if headings:
param_list.append((headings[0], link))
cursor.executemany('insert into tb_result values (default, %s, %s)',
param_list)
conn.commit()
except Error:
pass
# logging.error('SQL:', error)
finally:
conn.close()
def main():
ssl._create_default_https_context = ssl._create_unverified_context
start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
max_depth=2)
if __name__ == '__main__':
main()
from bs4 import BeautifulSoup
import requests
def main():
'http://sports.sohu.com/nba_a.shtml'
resp = requests.get('http://sports.sohu.com/nba_a.shtml')
html = resp.content.decode('gbk')
    # Parse with lxml
bs = BeautifulSoup(html, 'lxml')
    # bs.body.h1                          -- the h1 under body
    # bs.select('div p')                  -- CSS selector
    # bs.find_all(re.compile(r'^b'))      -- tags whose names start with "b"
    # bs.find(id='bar')                   -- find a single tag by id
    # bs.find_all('p', {'class': 'foo'})  -- p tags with class "foo"; returns a list
    # bs.select('a[href]')                -- a tags that have an href attribute
    # for elem in bs.select('a[href]'):
    #     # read the content of the href attribute
    #     print(elem.attrs['href'])
if __name__ == '__main__':
main()
import re
from bs4 import BeautifulSoup
import requests
def main():
'http://sports.sohu.com/nba_a.shtml'
    # Fetch the page via the requests library's get method
resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response bytes (some Sohu pages use gbk encoding)
html = resp.content.decode('gbk')
    # Create a BeautifulSoup object with the lxml parser to parse the page (comparable to the DOM in JavaScript)
bs = BeautifulSoup(html, 'lxml')
    # Find elements via CSS selector syntax and process them in a loop
for elem in bs.select('a[test=a]'):
        # Read the element's attribute values through the attrs property (a dict)
link_url = elem.attrs['href']
resp = requests.get(link_url)
bs_sub = BeautifulSoup(resp.text, 'lxml')
        # .text is the text inside the tag; attrs holds the tag's attributes
        # Replace the \r and \n characters in the text with ''
        # i.e. use a regular expression to further clean the extracted data
print(re.sub(r'[\r\n]', '', bs_sub.select_one('h1').text))
if __name__ == '__main__':
main()
# 天行新闻: print the article links found on a Sohu news page
import requests
from bs4 import BeautifulSoup
def get_page_code(start_url, *, retry_time=3, charsets=('gb2312',)):
try:
for charset in charsets:
try:
resp = requests.get(start_url)
html = resp.content.decode(charset)
break
except UnicodeDecodeError:
html = None
except Exception as e:
print('Error:', e)
        return get_page_code(start_url, retry_time=retry_time - 1, charsets=charsets) if retry_time > 0 else None
return html
def main():
'http://news.sohu.com/20171226/n526348972.shtml'
"""
-
17-09-10
综述:腾讯上线查小号功能,网吧下载H1Z1被公安查处
"""
url_list = ['http://news.sohu.com/20171226/n526348972.shtml']
visited_list = set()
while len(url_list) > 0:
current_url = url_list.pop(0)
print(current_url)
if current_url not in visited_list:
visited_list.add(current_url)
html = get_page_code(current_url, charsets=('gb2312', 'utf-8', 'gbk', 'ascii'))
if html:
soup = BeautifulSoup(html, 'lxml')
for elem in soup.select('div[class="mutu-news"] a'):
new_url = elem.attrs['href']
if new_url not in visited_list and new_url not in url_list:
url_list.append(new_url)
if __name__ == '__main__':
main()
"""
爬取知乎的推荐内容
data-za-element-name="Title"
url:https://www.zhihu.com/explore/recommendations
resp = requests.get(main_url,headers=headers)
如何评价 NBA 17-18 赛季西部决赛 G5 勇士 94:98 火箭,总比分火箭 3:2 领先?
"""
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import pymysql
def get_page_code(start_url, *, retry_time=3, charsets=('utf-8',)):
try:
for charset in charsets:
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}
resp = requests.get(start_url, headers=headers)
html = resp.content.decode(charset)
break
except UnicodeDecodeError as e:
html = None
except Exception as e:
print('Error:', e)
return get_page_code(start_url, retry_time=retry_time - 1, charsets=charsets) if retry_time > 0 else None
return html
def main():
url_list = []
start_url = 'https://www.zhihu.com/explore/recommendations'
html = get_page_code(start_url, charsets=('utf-8', 'gbk', 'gb2312'))
soup = BeautifulSoup(html, 'lxml')
for elem in soup.select('a[class="question_link"]'):
new_url = elem.attrs['href']
new_url = 'https://www.zhihu.com' + new_url
url_list.append(new_url)
    # Create the visited set and the database connection once, before the loop,
    # so pages are not processed twice and the connection can be closed at the end
    visited_list = set()
    conn = pymysql.connect(host='localhost', port=3306,
                           db='crawler', user='root',
                           charset='utf8',
                           passwd='123456')
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        if current_url not in visited_list:
            visited_list.add(current_url)
            html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
if html:
soup = BeautifulSoup(html, 'lxml')
elem = soup.select_one('h1[class="QuestionHeader-title"]')
if elem:
title = elem.text
content = ''
for elem in soup.select('span[class="RichText ztext CopyrightRichText-richText"] p'):
content += elem.text
                    now_date = datetime.now()
                    with conn.cursor() as cursor:
                        cursor.execute('insert into tb_questions (title, url, content, create_time) values (%s, %s, %s, %s)',
                                       (title, current_url, content, now_date))
                    conn.commit()
    conn.close()
if __name__ == '__main__':
main()