Python 爬虫入门
本文主要是对 Python 爬虫知识的一些梳理
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import codecs
import csv
def getHTML(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of handing them to the parser.
    r.raise_for_status()
    return r.content
def parseHTML(html):
    """Extract company entries from the company-list page markup.

    The page is expected to contain ``<div class="middle">`` with a nested
    ``<div class="list-ct">`` holding one or more
    ``<ul class="company-list">`` elements whose ``<li>`` items each wrap
    a link.

    Args:
        html: Page markup to hand to BeautifulSoup.

    Returns:
        A list of ``[text, href]`` pairs, one per ``<li>`` that contains a
        link with an ``href`` attribute. An empty list is returned when
        the expected containers are not present in the markup.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Each lookup may yield None when the layout differs from what is
    # expected; stop early instead of raising AttributeError on None.
    body = soup.body
    if body is None:
        return []
    company_middle = body.find('div', attrs={'class': 'middle'})
    if company_middle is None:
        return []
    company_list_ct = company_middle.find('div', attrs={'class': 'list-ct'})
    if company_list_ct is None:
        return []

    company_list = []
    for company_ul in company_list_ct.find_all('ul', attrs={'class': 'company-list'}):
        for company_li in company_ul.find_all('li'):
            link = company_li.a
            # Items without a usable link cannot produce a URL; skip them
            # rather than crashing on the subscript.
            if link is None or not link.has_attr('href'):
                continue
            company_list.append([company_li.get_text(), link['href']])
    return company_list
def writeCSV(file_name, data_list):
    """Write rows to *file_name* as UTF-8 encoded CSV.

    Args:
        file_name: Path of the file to create or overwrite.
        data_list: Iterable of rows; each row is an iterable of fields.
    """
    # newline='' lets the csv module control line endings itself; without
    # it, platforms that translate '\n' emit '\r\r\n' (blank rows on Windows).
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(data_list)
if __name__ == "__main__":
    # Script entry point: fetch the company index page, pull out every
    # company entry, and dump the result to a CSV file.
    listing_url = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
    companies = parseHTML(getHTML(listing_url))
    writeCSV('test.csv', companies)