A Simple Web Crawler Tutorial

Crawlers

A crawler is a program that fetches the content we need from the web, such as text, videos, images and other information.

Tools

Python 3.7 and PyCharm (the examples below also use the third-party libraries requests, bs4/BeautifulSoup, lxml, xlwt, xlrd and xlutils).

Workflow

Step 1: Request the site and download the page content

import urllib.request

url = "http://www.baidu.com"
# If the page URLs follow a pattern, they can be constructed in batch (see the sketch after this block), e.g.:
# urls = ['http://xa.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 5)]
print("Method 1")
response1 = urllib.request.urlopen(url)
# Get the status code; 200 means success
print(response1.getcode())
# Get the length of the page content
print(len(response1.read()))
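The comment in the block above mentions constructing URLs in batch when they follow a pattern; a minimal sketch of fetching each constructed page (the xiaozhu.com pattern is the original's example, and those pages are not guaranteed to still exist):

urls = ['http://xa.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 5)]
for page_url in urls:
    page = urllib.request.urlopen(page_url)
    print(page_url, page.getcode())  # status code for each constructed page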

print("第二种方法")
request_1 = urllib.request.Request(url)
# 模拟Mozilla浏览器进行爬虫
request_1.add_header("user-agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request_1)
print(response2.getcode())
print(len(response2.read()))
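Note that urlopen().read() returns bytes; to work with the page as text it has to be decoded. A minimal sketch continuing from Method 2, assuming the page is UTF-8 encoded:

response3 = urllib.request.urlopen(request_1)  # fresh response (the one above has already been read)
html_text = response3.read().decode("utf-8")   # bytes -> str, assuming UTF-8
print(html_text[:200])                         # first 200 characters of the page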

print("第三种方法,使用第三方库requests")
def start_requests(url):
    headers = {
        'User - Agent': 'Mozilla / 5.0(Windows NT 6.1;WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 63.0.3239.132 Safari / 537.36'
    }
    response = requests.get(url, headers=headers)
    #获取状态码,200表示成功
    print(response.status_code)
    return response.content.decode()
html=start_requests(url)
print(len(html))
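Real requests can time out or return error codes, so it is worth setting a timeout and checking the status explicitly. A minimal sketch using the requests API; the timeout value and the helper name fetch are illustrative choices, not part of the original:

import requests

def fetch(url, timeout=10):
    """Return the page text, or None if the request fails (hypothetical helper)."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # raise an exception for 4xx/5xx status codes
        response.encoding = response.apparent_encoding  # let requests guess the real encoding
        return response.text
    except requests.RequestException as exc:
        print("Request failed:", exc)
        return None

html = fetch("http://www.baidu.com")
if html:
    print(len(html))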

Step 2: Parse the page with BeautifulSoup

1. Search for tags with soup.select()

import requests
from bs4 import BeautifulSoup

def get_text(urls, data=None):
    web_data = requests.get(urls).content.decode()
    soup = BeautifulSoup(web_data, 'lxml')
    # Get the page title
    title = soup.select('title')
    # Get the body text
    texts = soup.select('div > p.article-intro')  # <p> tags with class "article-intro"; adjust the selector to the target page
    # texts = soup.select('li')             # search for <li> tags
    # texts = soup.select('div > ul > li')  # a more specific selector for the same <li> tags
    print(len(texts))
    for text in texts:
        data = {
            'text': text.get_text()  # get_text() returns the text of the tag and its children (commonly used)
        }
        print(data)

get_text(url)

2. Search for and extract tag content with soup.find_all()
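The find_all() and find() snippets below work on a soup object, but the original does not show the html_doc it was built from. A minimal stand-in, modeled on the example document from the BeautifulSoup documentation (an assumption, so the searches below have something to match):

import re
from bs4 import BeautifulSoup

# Assumed sample document; the real tutorial presumably parsed a downloaded page.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</body></html>
"""
soup = BeautifulSoup(html_doc, "html.parser")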

links = soup.find_all('a')
print("All links")
for link in links:
    print(link.name, link['href'], link.get_text())
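If some <a> tags have no href attribute, indexing with link['href'] raises a KeyError; a slightly more defensive sketch (continuing with the soup object above) uses get() instead:

for link in soup.find_all('a'):
    href = link.get('href')  # returns None instead of raising KeyError when the attribute is missing
    if href:
        print(link.name, href, link.get_text())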

3. Parse with soup.find() and regular-expression matching

# Create a BeautifulSoup parse object
soup = BeautifulSoup(html_doc, "html.parser")  # from_encoding is only needed when html_doc is bytes

print("Find the link with a specific href")
link_node = soup.find('a', href="http://example.com/elsie")
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Regular-expression matching on href")
link_node = soup.find('a', href=re.compile(r"ti"))
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Get the text of the <p> paragraph")
p_node = soup.find('p', class_='story')
print(p_node.name, p_node['class'], p_node.get_text())


Step 3: Save or print the parsed content

1. Save to a txt file

f = open('content.txt', 'a+', encoding="utf-8")  # open the file in append mode
f.write(text.get_text())  # `text` is one of the tags found in Step 2
f.close()
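A with block closes the file automatically even if writing fails partway; a minimal sketch of the same append, assuming texts is the list returned by soup.select() in Step 2:

with open('content.txt', 'a+', encoding='utf-8') as f:
    for text in texts:
        f.write(text.get_text() + '\n')  # one extracted block of text per line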

2. Save to an Excel (xls) file

# --- partial code ---
import xlwt
import xlrd
from xlutils.copy import copy

# Approach A: create a brand-new workbook with xlwt
workbook = xlwt.Workbook(encoding='utf-8')
sheet1 = workbook.add_sheet('西安')

# Approach B: append to an existing xls file (xlrd to read it, xlutils.copy to make it writable)
r_xls = xlrd.open_workbook('xian.xls')
excel = copy(r_xls)
table = excel.get_sheet(0)

# Write one row of scraped data (row, title, address and url come from the crawling code)
table.write(row, 0, row)
table.write(row, 1, title.string.strip())
table.write(row, 2, address.string.strip())
table.write(row, 3, url)

# Save with excel.save() when appending (Approach B), or workbook.save() when writing a new file (Approach A)
excel.save('xian.xls')
# workbook.save('xian.xls')
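For reference, a complete, self-contained sketch of the "new workbook" path with xlwt; the column names and sample rows are illustrative placeholders, not data from the original:

import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
sheet = workbook.add_sheet('西安')  # sheet named after the city being scraped (Xi'an)

# Header row
for col, name in enumerate(['index', 'title', 'address', 'url']):
    sheet.write(0, col, name)

# Sample rows; in the crawler these values would come from the parsed pages
rows = [
    (1, 'Sample listing', 'Sample street 1', 'http://example.com/1'),
    (2, 'Another listing', 'Sample street 2', 'http://example.com/2'),
]
for row_index, (idx, title, address, url) in enumerate(rows, start=1):
    sheet.write(row_index, 0, idx)
    sheet.write(row_index, 1, title)
    sheet.write(row_index, 2, address)
    sheet.write(row_index, 3, url)

workbook.save('xian.xls')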
