1、requests 带headers
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
res = requests.get('http://bj.xiaozhu.com/', headers=headers)
print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
print(soup.prettify())
2、beautifulsoup的4种解析库
beautifulsoup.find_all
soup.find_all('div', attrs={'class': 'item'})
beautifulsoup.select()
p = soup.select('div.item > a > h1')  # select() 返回列表
p[0].get_text()  # 中间文字内容
3、 1 example:爬取北京地区短租房信息
2 example:爬取酷狗TOP500的数据
3 example:爬取斗破苍穹小说
4 example:爬取糗事百科
4、re修饰符
5 xpath
pip install lxml
from lxml import etree
selector=etree.HTML(res.text)
id=selector.xpath('//')
6 5 example:爬取豆瓣图书TOP250的数据
6 example:爬取起点中文网小说信息
7 example:爬取PEXELS图片
8 example:爬取糗事百科的用户地址信息
9 example:爬取豆瓣音乐TOP250的数据
10 example:爬取豆瓣电影TOP250的数据
11 example:爬取简书网热评文章
12 example:爬取转转网二手市场商品信息
13 example:爬取简书网用户动态信息
14 example:爬取简书网7日热门信息
15 example:爬取拉勾网招聘信息
16 example:爬取新浪微博好友圈信息
pip install jieba
TAGUL词云制作
17 example:
xiaozhuspider.py????
18 example:爬取简书网热门专题信息
19 example:爬取知乎网python精华话题
20 example:爬取简书网专题收录文章
CREATE TABLE jianshul (
USER TEXT,
TIME TEXT,
title TEXT,
VIEW TEXT,
COMMENT TEXT,
lik TEXT,
gain TEXT
)ENGINE INNODB DEFAULT CHARSET=utf8;
21 example:爬取简书网推荐信息
7 json.loads()
x.get()
8 pymongo
c=pymongo.MongoClient('localhost',27017)
db=c['mydb']          # 数据库
x=db['username']      # 集合(collection)
x.insert_one({'a':1,'x':2})
mongoexport -d mydb -c test --type=csv -f name,sex,grade -o test.csv
9 pymysql
import pymysql
conn=pymysql.connect(host='localhost',user='',passwd='',db='',port=3306,charset='utf8')
cursor=conn.cursor()
cursor.execute('xx')
conn.commit()
10 多进程
from multiprocessing import Pool
pool=Pool(processes=4)
pool.map(func, iterable[, chunksize])
11 cookie
12 jieba词频分析
13 selenium
driver.implicitly_wait(10)
driver.switch_to.frame(frame_reference)
14 csv.DictReader(fp)
for row in reader: