爬取网站的流程:
注意事项:
import requests,re
from lxml import etree
base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
def get_value(value):
    """Return the first element of an xpath result list, or '' when empty."""
    return value[0] if value else ''
# Crawl pages 1-3 of the word list and collect one-key {word: meaning} dicts.
result = []
for page in range(1, 4):
    page_html = etree.HTML(requests.get(base_url % page, headers=headers).text)
    # print(etree.tostring(page_html, pretty_print=True, encoding='utf-8').decode('utf-8'))
    for row in page_html.xpath('//tbody/tr[@class="row"]'):
        word = get_value(row.xpath('.//td[@class="span2"]/strong/text()'))
        mean = get_value(row.xpath('.//td[@class="span10"]/text()'))
        result.append({word: mean})
print(result)
import requests,json
from lxml import etree
class Shanbei(object):
    """Crawl the Shanbay word list (pages 1-3) and save word/meaning
    pairs to ``shanbei_word.json``.

    Crawling and saving both run from ``__init__``, so constructing the
    object performs the whole job.
    """

    def __init__(self, url, headers):
        self.url = url          # page-url template containing a %s placeholder
        self.headers = headers  # request headers (user-agent)
        self.result = []        # list of one-key {word: meaning} dicts
        self.word_mean()
        self.save_data()

    def get_value(self, value):
        """Return the first element of an xpath result list, or ''."""
        if value:
            return value[0]
        return ''

    def word_mean(self):
        """Fetch pages 1-3 and extract word/meaning pairs into self.result."""
        for i in range(1, 4):
            # BUG FIX: originally passed the module-level ``headers`` instead
            # of ``self.headers``; the class broke when the global was absent.
            response = requests.get(self.url % i, headers=self.headers)
            html = etree.HTML(response.text)
            # print(etree.tostring(html,pretty_print=True,encoding='utf-8').decode('utf-8'))
            tr_list = html.xpath('//tbody/tr[@class="row"]')
            for tr in tr_list:
                item = {}
                word = self.get_value(
                    tr.xpath('.//td[@class="span2"]/strong/text()'))
                mean = self.get_value(
                    tr.xpath('.//td[@class="span10"]/text()'))
                item[word] = mean
                self.result.append(item)

    def save_data(self):
        """Dump self.result to shanbei_word.json as UTF-8 JSON."""
        with open('shanbei_word.json', 'w', encoding='utf-8') as fp:
            json.dump(self.result, fp)
if __name__ == '__main__':
    base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Constructing the object runs the whole crawl and writes the JSON file.
    Shanbei(base_url, headers)
    # BUG FIX: read back with the same encoding the file was written in;
    # the platform-default encoding can fail to decode Chinese text.
    with open('shanbei_word.json', 'r', encoding='utf-8') as fp:
        result = json.load(fp)
    print(result)
可迭代对象:有 __iter__ 属性的对象。
迭代器:有 __next__ 属性的对象。
两个可以转换吗?
iter(可迭代对象)---->返回值为迭代器
可迭代对象都有哪些?
打印一个文件,同时输出行号
# Print a file's contents with line numbers.
# BUG FIX: the original opened the file and never closed it; a ``with``
# block guarantees the handle is released even if printing fails.
with open('shanbei_word.py', 'r', encoding='utf-8') as fp:
    print(fp)  # show the file object itself (name/mode/encoding)
    for i, content in enumerate(fp, 1):
        print(i, content)
import requests
from lxml import etree
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
def get_xpath(url):
    """Download *url* with the shared headers and parse it into an lxml tree."""
    page_text = requests.get(url, headers=headers).text
    return etree.HTML(page_text)
# Fetch a singer's description page, attach the introduction text to *item*,
# and store the finished item in the module-level ``result`` list.
def get_info(url, item):
    page = get_xpath(url)
    paragraphs = page.xpath('//div[@class="n-artdesc"]/p/text()')
    item['introduce'] = ''.join(paragraphs)
    result.append(item)
# Scrape one initial-letter page: record each singer's name and detail url,
# then fetch that singer's introduction page.
def get_single(url):
    page = get_xpath(url)
    names = page.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/text()|//ul[@id="m-artist-box"]/li/a[1]/text()')
    hrefs = page.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href|//ul[@id="m-artist-box"]/li/a[1]/@href')
    for idx, singer_name in enumerate(names):
        item = {
            'name': singer_name,
            'url': 'https://music.163.com' + hrefs[idx].replace(' ', ''),
        }
        # the introduction page lives at .../desc?id=...
        get_info(item['url'].replace('?', '/desc?'), item)
# Walk the initial-letter index links on one region page
# (position()>1 drops the first index entry).
def get_type_page(url):
    page = get_xpath(url)
    letter_hrefs = page.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
    for href in letter_hrefs:
        # crawl the singers listed under this initial
        get_single('https://music.163.com' + href)
# NOTE: watch out for anchor (#) fragments in the crawled urls.
base_url = 'https://music.163.com/discover/artist'

# Collect the region-category urls (links whose href contains "id")
# and crawl each region's initial-letter pages.
def get_type():
    page = get_xpath(base_url)
    region_hrefs = page.xpath('//ul[@class="nav f-cb"]/li/a[contains(@href,"id")]/@href')
    for href in region_hrefs:
        get_type_page('https://music.163.com' + href)
if __name__ == '__main__':
    # ``result`` is filled by get_info via the get_type -> ... call chain.
    result = []
    get_type()
    print(result)
import requests,json
from lxml import etree
class Music(object):
    """Crawl music.163.com's artist directory.

    Walks region categories -> initial-letter pages -> singer lists,
    collecting each singer's name, url and introduction into
    ``self.result``. The whole crawl is kicked off from ``__init__``.
    """

    def __init__(self, base_url):
        self.base_url = base_url
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.result = []
        self.get_type()

    # NOTE: watch out for anchor (#) fragments in the crawled urls.
    def get_xpath(self, url):
        """Download *url* and return the parsed lxml HTML tree."""
        page_text = requests.get(url, headers=self.headers).text
        return etree.HTML(page_text)

    def get_type(self):
        """Visit every region-category page linked from the artist index."""
        page = self.get_xpath(self.base_url)
        region_hrefs = page.xpath(
            '//ul[@class="nav f-cb"]/li/a[contains(@href,"id")]/@href')
        for href in region_hrefs:
            self.get_type_page('https://music.163.com' + href)

    def get_type_page(self, url):
        """Visit each initial-letter page of one region (first entry skipped)."""
        page = self.get_xpath(url)
        letter_hrefs = page.xpath(
            '//ul[@id="initial-selector"]/li[position()>1]/a/@href')
        for href in letter_hrefs:
            self.get_single('https://music.163.com' + href)

    def get_single(self, url):
        """Record each singer's name/url on a letter page, then the intro."""
        page = self.get_xpath(url)
        names = page.xpath(
            '//ul[@id="m-artist-box"]/li/p/a[1]/text()|//ul[@id="m-artist-box"]/li/a[1]/text()')
        hrefs = page.xpath(
            '//ul[@id="m-artist-box"]/li/p/a[1]/@href|//ul[@id="m-artist-box"]/li/a[1]/@href')
        for idx, singer_name in enumerate(names):
            item = {
                'name': singer_name,
                'url': 'https://music.163.com' + hrefs[idx].replace(' ', ''),
            }
            # the introduction page lives at .../desc?id=...
            self.get_info(item['url'].replace('?', '/desc?'), item)

    def get_info(self, url, item):
        """Attach the singer's introduction to *item* and store it."""
        page = self.get_xpath(url)
        paragraphs = page.xpath('//div[@class="n-artdesc"]/p/text()')
        item['introduce'] = ''.join(paragraphs)
        self.result.append(item)
if __name__ == '__main__':
    start_url = 'https://music.163.com/discover/artist'
    # constructing the crawler runs the whole crawl
    crawler = Music(start_url)
    with open('singer.json', 'w', encoding='utf-8') as fp:
        json.dump(crawler.result, fp)
解决办法:将user-agent封装到请求头中
解决办法:设置爬取间隔
# BUG FIX: random.randint requires two bounds — randint(5) raises TypeError.
# Sleep a random 1-5 seconds between requests to throttle the crawler.
n = random.randint(1, 5)
time.sleep(n)
解决办法:设置代理ip
解决办法:使用selenium+phantomjs可以获取页面数据
selenium:是web自动测试的工具。
phantomjs:是一个无界面的浏览器,所以它可以运行js代码,帮我们拿到页面数据。
它们配合使用就可以解决页面是js代码的数据获取问题
下载和安装
复制到anaconda的script目录下,然后执行
pip install selenium==2.48.0
为了使测试工具与浏览器交互,需要使用ChromeDriver
查看google版本号
选择大版本号对应,小版本号最接近的
解压后同样放在anaconda的script目录中
html是页面的骨架,css是页面的装饰,js是页面的灵魂。
jquery是一个库,可以使js代码更加简化。
Ajax 是一种技术:web页面的异步请求技术。
DHTML是Dynamic HTML的缩写,意思是动态的HTML。它并不是一门独立的语言,实际上任何可以实现页面动态改变的方法都可以成为DHTML。