安装Python以及必要的模块(requests、lxml,其中lxml用于xpath解析)
新笔趣阁
import requests
import time
import sys
from lxml import etree
# Fetch the raw HTML of the page at the given URL.
def get_content(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` may block indefinitely.

    Returns:
        The page text on success, or the sentinel string ``" ERROR "`` when
        the request fails (the error is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    try:
        r = requests.get(url=url, headers=headers, timeout=timeout)
        # Force UTF-8 instead of relying on the encoding guessed from headers.
        r.encoding = 'utf-8'
        content = r.text
    except requests.RequestException as exc:
        # Only network/HTTP-layer failures are handled here; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        print("Error '%s' happened on line %d" % (exc, exc.__traceback__.tb_lineno))
        return " ERROR "
    print(content)
    return content
# Parse the fetched content (the main topic studied today).
def get_analysis(content):
    """Extract the text of the ``<div id="content">`` element and save it.

    The direct text nodes of the div are joined with newlines, printed,
    and handed to ``save``. Nothing is returned.
    """
    print(type(content))
    text_nodes = etree.HTML(content).xpath("//div[@id='content']/text()")
    print(text_nodes)
    chapter_text = "\n".join(text_nodes)
    print(chapter_text)
    save(chapter_text)
# for result in result:
# element = result.xpath("br")[0]
# print(len(element))
# print(type(element))
# Write the extracted text to a document.
def save(finishedProduct, filename="元尊.txt"):
    """Append *finishedProduct* plus a trailing newline to *filename*.

    Args:
        finishedProduct: Text to append.
        filename: Destination file, opened in append mode as UTF-8 and
            created if missing. Defaults to the original hard-coded name.
    """
    # The context manager guarantees the file is flushed and closed; the
    # previous code referenced ``f.close`` without calling it.
    with open(filename, "a+", encoding='utf-8') as f:
        f.write(finishedProduct + '\n')
# Main program
def main(url='https://www.xsbiquge.com/78_78513/108078.html'):
    """Fetch one chapter page, extract and save its text, and print the elapsed time.

    Args:
        url: Page to scrape. Defaults to the single chapter page the script
            was originally limited to.
    """
    start_time = time.time()
    content = get_content(url)
    # get_content reports a failed request with this sentinel string; there
    # is no page to parse in that case, so skip extraction and saving.
    if content != " ERROR ":
        get_analysis(content)
    project_time = time.time() - start_time
    print('程序用时', project_time)


main()
缺点:
1.代码繁琐用时长
2.单一无法变通
3.基础严重漏洞
总结:写得超级潦草,原本以为xpath学得很好,没想到这么简单的东西折腾了我半天。每天需要继续写这个爬虫,奥力给!!