An introductory blog post on web crawlers:
http://blog.sina.com.cn/s/blog_63cf1c510101dshu.html
Using BeautifulSoup:
http://wiki.jikexueyuan.com/project/python-crawler-guide/beautiful-soup.html
https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
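For quick reference, a minimal bs4 sketch of my own (the file name page.html and the target="_blank" filter are just placeholders, not taken from the linked docs):

# Minimal BeautifulSoup usage sketch (placeholder file name "page.html")
from bs4 import BeautifulSoup

with open("page.html") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# find_all returns every matching tag; get_text() strips the markup
for a in soup.find_all("a", target="_blank"):
    print(a.get_text())

# tag attributes are accessed like a dict
first_link = soup.find("a")
if first_link is not None:
    print(first_link.get("href"))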
Some notes on Python encoding issues:
https://www.cnblogs.com/nyist-xsk/p/7732279.html
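The short version for Python 2 (which the script below uses): str holds raw bytes, unicode holds text, and mixing them without an explicit decode()/encode() is what raises UnicodeDecodeError. A rough sketch of my own, not taken from the linked post:

# -*- coding: utf-8 -*-
# Python 2 encoding basics (my own sketch, not from the linked post)
s = '\xe5\xbc\xa0\xe4\xb8\x89'    # UTF-8 bytes for a Chinese name (str)
u = s.decode('utf-8')             # bytes -> unicode text
b = u.encode('gbk')               # unicode -> bytes in another encoding
print type(s), type(u), type(b)   # <type 'str'> <type 'unicode'> <type 'str'>

# Writing unicode to a file opened in byte mode needs an explicit encode;
# the crawler below sidesteps this with sys.setdefaultencoding('utf-8').
with open('out.txt', 'w') as f:
    f.write(u.encode('utf-8') + '\n')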
My own script for scraping the person names on http://www.resgain.net/xmdq.html:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Fetch the page content for a given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html

# Extract the names from one paginated result page
def getname(html):
    bs = BeautifulSoup(html, 'html.parser')
    tmp = bs.find_all('a', target='_blank')
    #rel = u'([\u4E00-\u9FA5]+?)'
    rel = r'target=\"_blank\"\>(.+?)\<'
    names = re.findall(rel, str(tmp))
    return names

# Fetch one page and append the names it contains to the output file
def save(url):
    html = gethtml(url)
    pname = getname(html)
    global fo
    for x in pname:
        #print x.decode("unicode_escape")
        #fo.write(x.decode('unicode_escape')+'\n')
        fo.write(x.decode('utf-8') + '\n')

# Get the per-surname category links from the main page
def getmain(html):
    bs = BeautifulSoup(html, 'html.parser')
    tmp = bs.find_all('a', class_='html-attribute-value html-external-link')
    rel = r'href=\"(http://.[^w][^\"]+?)\"'
    tags = re.findall(rel, str(tmp))
    return tags

url_main = "http://www.resgain.net/xmdq.html"
#html_main = gethtml(url_main)
html_main = open("t.html")    # parse a locally saved copy of the main page
filename = "rename.txt"
fo = open(filename, "w")
all_tag = getmain(html_main)
#print all_tag
for i in all_tag:
    print i
    save(i)
    # pages 2..10 of a category reuse the base URL with "_2" .. "_10"
    # inserted before the trailing ".html"
    i1 = i[:-5]
    i2 = i[-5:]
    for j in range(2, 11):
        url_child = i1 + '_' + str(j) + i2
        print url_child
        save(url_child)
fo.close()
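For reference, a rough Python 3 adaptation of the fetch-and-parse part (urllib2 and sys.setdefaultencoding no longer exist in Python 3). This is my own sketch, not part of the original script; it assumes the pages are served as UTF-8 and only demonstrates the steps on the main page:

#!/usr/bin/env python3
# Rough Python 3 adaptation of the fetch-and-parse steps above (my own sketch)
import urllib.request
from bs4 import BeautifulSoup

def gethtml(url):
    # assumes the page is UTF-8 encoded
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

def getname(html):
    bs = BeautifulSoup(html, 'html.parser')
    # same idea as the regex above, but let bs4 extract the link text
    return [a.get_text() for a in bs.find_all('a', target='_blank')]

if __name__ == '__main__':
    html = gethtml('http://www.resgain.net/xmdq.html')
    with open('rename.txt', 'w', encoding='utf-8') as fo:
        for name in getname(html):
            fo.write(name + '\n')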