首先注意:学习新东西需要迅速获得成就感,所以有其他编程语言基础或者略懂的同志们,可以直接上手写代码,哪里不会学哪里;先搞个能出结果的基本例子,之后再继续深入研究。
环境:idea编辑器,python3
其实天下文章一大抄,代码也能先抄抄;先爽了再说;
from urllib import request
if __name__ == "__main__":
    # Minimal demo: download one page and print its HTML source.
    # The response is used as a context manager so the underlying
    # connection is released even if read() or decode() raises.
    with request.urlopen("https://www.cnblogs.com/panzi/p/6421826.html") as response:
        html = response.read().decode("utf-8")
    print(html)
下面的代码需要导入一些模块(例如 pymysql),安装方法请自行百度。
# 抓取菜鸟教程网站上各个技术页面的信息、子信息及相关描述
# 升级----写入文件中
# 升级----写入数据库
from urllib import request
import urllib
import re
import pymysql
# Database access
def testMysql(value1):
    """Insert one scraped record into the ``runoob2`` table.

    A new connection is opened for this call and closed before returning.
    The INSERT is parameterized, so the values are passed to the driver
    separately from the SQL text.

    Args:
        value1: Sequence of three values bound, in order, to the
            ``title1``, ``url`` and ``description`` columns.

    Errors raised while executing or committing the INSERT are printed and
    not propagated (best-effort write). A failure to connect still raises.
    """
    # NOTE(review): connection settings and credentials are hard-coded here.
    conn = pymysql.connect(host='localhost', port=3306, user='root', password='123', db='test', charset='utf8')
    try:
        cu = conn.cursor()
        try:
            cu.execute('insert into runoob2(title1,url,description) values(%s,%s,%s)', value1)
            print(cu)
            conn.commit()
        except Exception as e:
            # Keep the crawl going when a single row cannot be written.
            print(e)
        finally:
            cu.close()
    finally:
        # Always release the connection, including when cursor creation
        # fails or the run is interrupted.
        conn.close()
# Open the URL and return the page source
def openurl(url, encoding="utf-8"):
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Address to open; passed unchanged to ``request.urlopen``.
        encoding: Codec used to decode the response bytes. Defaults to
            ``"utf-8"``, which is what this function always used before.

    Returns:
        The full response body as a ``str``.

    Raises:
        Whatever ``request.urlopen`` raises for an unreachable or invalid
        URL, and ``UnicodeDecodeError`` if the body is not valid in
        *encoding*.
    """
    # The context manager closes the response even if read() or decode()
    # fails, so repeated calls in a crawl loop do not leak connections.
    with request.urlopen(url) as response:
        return response.read().decode(encoding)
# Module-level holders for the sub-page processed most recently:
# its title, its absolute URL, and the stringified description matches.
f1 = ''
f2 = ''
urlList2 = ''
if __name__ == "__main__":
    # Navigation entries that are not technology tutorials and are skipped.
    skipped_names = ('首页', '更多……', '用户登录', '注册新用户')

    # NOTE(review): the three patterns below look like they lost their HTML
    # markup when this listing was published. `reg` has a single capture
    # group and `reg1` has none, while the loops below unpack two values per
    # match. Restore the original patterns before running.
    reg = r'(.*?) '
    reg1 = r''
    ex = r''

    urlList = re.findall(reg, openurl("http://www.runoob.com/python3/python3-tutorial.html"))
    for iurl, iname in urlList:
        if iname in skipped_names:
            print('无用的url:' + iurl + iname)
        else:
            htmlp = "http://www.runoob.com" + iurl
            print('需要的技术标签:' + iname + ',url:' + htmlp)
            urlList1 = re.findall(reg1, openurl(htmlp))
            for urlname, urlson in urlList1:
                # No `global` statements are needed: this code already runs
                # at module scope, and declaring a name global after it has
                # been assigned is a SyntaxError on current Python 3.
                htmlson = 'http://www.runoob.com/' + urlson + '.html'
                print(urlname + '--' + htmlson)
                f1 = str(urlname)
                f2 = str(htmlson)
                fa = str(re.findall(ex, openurl(htmlson)))
                urlList2 = fa
                print(str(urlList2))
                # Alternative sink, reported as working: append to a file.
                # with open('D:\\url.txt',mode='a+',encoding='utf-8') as f:
                #     f.write(str(f1)+'\n')
                #     f.write(str(f2)+'\n')
                #     f.write(str(urlList2)+'\n')
                # NOTE(review): the original indentation was lost; one
                # database row per sub-page is assumed here.
                value1 = [f1, f2, urlList2]
                testMysql(value1)
初学总结:功能实现了——完美。后期优化点:文件写入/数据库写入——如何节省资源、加快写入速度。
优化总结:
爬取优化——调度器,以及防止被封的反爬应对策略;
展示优化——调用词云、前台显示等,形象地展示数据。