爬虫实例1:2018中国最富1000人名单及信息(table格式)

前两天正好看到这个新闻,就顺手爬了一下;这里用的是re正则表达式,改用BeautifulSoup会更省力一些

所需爬取链接:http://finance.sina.com.cn/zt_d/jmzf2018/

网页为静态页面,为表结构,不需要翻页,较简单

使用到的模块:

re  # 正则
urllib.request # 亦可用requests模块,今天看了urllib就正好用了
openpyxl.Workbook # 在python中操作excel

源码:

import re
import urllib.request
from openpyxl import Workbook
# Use re to scrape the table on the "2018 China's richest 1000 people" page.


# Address of the page that the script downloads.
url = 'http://finance.sina.com.cn/zt_d/jmzf2018/'
# Title assigned to the worksheet of the generated workbook.
Sheet1_title = '2018年中国最有钱1000人'

def get_html(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response (and its connection) even when
    # reading or decoding raises; the original left it open.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def get_word(data):
    """Return all non-overlapping matches of the ``.+`` pattern in *data*.

    Because ``.`` does not match a newline, each match is one maximal run of
    non-newline characters, so empty lines produce no entry.

    NOTE(review): the pattern contains no HTML tags or capture groups; it may
    have lost them when this script was published -- confirm against the
    page markup before relying on the output.

    Args:
        data: Text to search.

    Returns:
        A list of matched strings, in order of appearance.
    """
    return re.compile(r'.+').findall(data)


def do_list(list):
    """Clean the extracted strings and group them into rows of seven.

    Each item has surrounding spaces stripped and every ``'amp;'`` substring
    removed (so ``'&amp;'`` becomes ``'&'``). Consecutive items are then
    grouped seven at a time, one group per table row.

    The original loop only flushed a group at the start of the *next*
    iteration, so the final complete group was never emitted; slicing fixes
    that. The original ``lstrip('')`` / ``rstrip('')`` calls were no-ops and
    are dropped.

    Args:
        list: Iterable of raw cell strings. The name shadows the builtin
            inside this function and is kept only for caller compatibility.

    Returns:
        A list of 7-item lists. A trailing group with fewer than seven items
        is discarded, because ``excel`` unpacks exactly seven values per row.
    """
    cleaned = [item.strip(' ').replace('amp;', '') for item in list]
    # Ignore a trailing incomplete group rather than emit a short row.
    usable = len(cleaned) - len(cleaned) % 7
    return [cleaned[start:start + 7] for start in range(0, usable, 7)]


def excel(list_s):
    """Write the ranking rows to an Excel workbook and save it.

    Row 1 of the active sheet (titled ``Sheet1_title``) receives the seven
    column captions; every entry of *list_s* fills columns A-G of the next
    row, starting at row 2. The workbook is saved as
    ``'2018年中国最有钱1000人.xlsx'``, a path relative to the current
    working directory.

    Args:
        list_s: Iterable of 7-item rows in the order rank, name, wealth,
            age, sex, main source of wealth, company location.

    Raises:
        ValueError: If a row does not contain exactly seven items.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = Sheet1_title

    captions = (
        '排名',
        '姓名',
        '财富(亿元)',
        '年龄',
        '性别',
        '主要财富来源',
        '主要公司所在地',
    )
    for column, caption in enumerate(captions, start=1):
        sheet.cell(row=1, column=column, value=caption)

    # Unpacking into seven names keeps the strict row-length check.
    for row, (num, name, money, age, sex, comp, add) in enumerate(list_s, start=2):
        record = (num, name, money, age, sex, comp, add)
        for column, value in enumerate(record, start=1):
            sheet.cell(row=row, column=column, value=value)

    workbook.save(filename='2018年中国最有钱1000人.xlsx')


def main():
    """Fetch the ranking page, group its cells into rows, and save them to Excel.

    Intermediate results stay local, so the builtin ``list`` is no longer
    rebound at module scope.
    """
    html = get_html(url)
    cells = get_word(html)
    rows = do_list(cells)
    excel(rows)


# Run the scraper only when executed as a script, so importing this module
# does not trigger a network request or write a file.
if __name__ == '__main__':
    main()

 

你可能感兴趣的:(Python)