Day 1 of Learning Web Scraping (a first crawl of the company's internal OA address book)

This first attempt uses requests and BeautifulSoup.

import requests
from bs4 import BeautifulSoup
import re

# First obtain the headers from Chrome DevTools, including the User-Agent and Cookie
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Cookie': 'tips=1; V7Qq_2132_smile=1D1; nodeId=_all; nodeType=1; V7Qq_2132_saltkey=jE3E1veZ; V7Qq_2132_lastvisit=1540778092; warning=1; TimeOut=-1; LoginTime=1541143580000; ShowAlert=0; ShowTimeOut=0; V7Qq_2132_editormode_e=1; V7Qq_2132_ulastactivity=55c2hTl9kv0hz5iSWgg5RJkg9kBm4aEroMUZuN29r4gMCYeZ2aiw; V7Qq_2132_forum_lastvisit=D_36_1541377975; V7Qq_2132_visitedfid=36D48D46D131D53D42D45D39D37D40; logintype=4; tivoli_loginname=zhanglei1; acc_valid=1; PD_STATEFUL_e128b204-9490-11e6-9cff-d8d385a4b2ee=%2FMultiSeUn; vfCode=dWBcNp; auth_flag=true; PD-H-SESSION-ID=4_EpcBsRGRc3I-b24ba6HlGbUgDTsjra08n9uHTBdviN28VpKW; loged=4_EpcBsRGRc3I-b24ba6HlGbUgDTsjra08n9uHTBdviN28VpKW; messageStr=%u4E0A%u6B21%u767B%u5F55%u65F6%u95F4%uFF1A%3Cfont%20color%u7B49%u4E8E%27yellow%27%3E2018-11-06%2008%3A20%3A16%3C/font%3E%uFF0CIP%uFF1A%3Cfont%20color%u7B49%u4E8E%27yellow%27%3E10.6.172.174%3C/font%3E; updatesessiontime=1541471002; goto_url=/; AMWEBJCT!%2FportalserverU!JSESSIONID=0000d7oUW9Bm2gzV4dKnkwGH0U0:1a4c3h0hk; LtpaToken=x5qkJ7PIdJ36dd3xp+WPwGG8KyvONhpP6LUAK5mJKm6q+7vewMmlzZsUNch+tED1xN8hjrf6JeJ/mP+G7jlYr4VpPYwLf6FW2ZnHCndRB0NVcpppTbB4+BWwaoI5cs/42A+/QIWYCFJpn7L2RJ34eYoQoHNVwr5oWXkbFGArfUWlPjf1p+rEXhk8lAjWHxpHMR500Colf3GTIKKQoIqIwW1AwjsbFuK0SfGzuEh8WI3Iy3VCcxBo8vTEMOHOh4DHJhrJ6esQzRVszXNesWgOP5f1hl/AfBrPbbgNEnuupUj0cxT+PKIUKj0x7uIYM6PQC9h19EnprymCc6dAF0vZxmMnaYeAVfWz; ltpacreatetime=1541471002273; AMWEBJCT!%2Fportalwas1!JSESSIONID=0000_dTG8afK3Dxol1zfMOwv5uy:1a3qoeddj; MssSsoToken=QW6uLAypiih/mW33jw2kbk3JUc903OwJNa3hMDtrZs4=; updatesessiontime=1541471005; mssPortalPreLogin=0; AMWEBJCT!%2Fappserver!JSESSIONID=00009qGbpL_sQKDZicvLb2SAxf_:16saotmp4; PD_STATEFUL_a2d06bb2-9f14-11e1-9654-0050561100ae=%2Fappserver'
}

# Get the maximum page count of this department's address book
url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=0331500020&currentPage=1&orderIndex=&orderSign=1&str=all&isVirtal=no"
r = requests.get(url=url, headers=headers)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'lxml')
totalPage = int(soup.find(name='input',attrs={'name':'totalPage'})['value'])
# Use a regular expression to get the department's total head count
temp_renshu = soup.find(name='b', attrs={'class': 'f-fam1'}).string
renshu = int(re.search(r'\D\D(\d+)\D', temp_renshu).group(1))
print("The department address book has {} pages and {} people".format(totalPage, renshu))

# Walk through every page and collect the table cells
pudong_list = []
for i in range(1,totalPage+1):
    url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=0331500020¤tPage="+str(i)+"&orderIndex=&orderSign=1&str=all&isVirtal=no"
    r = requests.get(url=url,headers = headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    for tbody in soup.find_all(name='tbody'):
        for td in tbody.find_all(name='td'):  # each person yields 6 <td> cells, including a leading empty field
            pudong_list.append(td.text.strip())
# print(pudong_list)

# Pull the fields we need out of the flat cell list: 6 cells per person, the first one empty
name_list = []
for f in range(renshu):
    name_list.append(','.join(pudong_list[f*6+1:f*6+6]))
# Finally, export the collected fields to a text file
with open('pudong.txt', 'a', encoding='utf-8') as file:
    file.write('\n'.join(name_list))

Follow-up study plan:
1. Scrape the other departments' data (see the sketch after this list);
2. Scrape further details for each person, including photos.
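
Toward item 1, the per-department crawl above could be wrapped in a function and run once per department. This is only a rough sketch, assuming other departments differ solely in the ZDWID query parameter; the entries in OTHER_DEPT_IDS are placeholders rather than real IDs, and headers is the dict defined at the top of the script.

import requests
from bs4 import BeautifulSoup

# Rough sketch: assumes every department's address book differs only in ZDWID.
# The IDs below are placeholders; `headers` is the dict defined above.
BASE_URL = ("http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do"
            "?method=doSearch&ZDWID={dept_id}&currentPage={page}"
            "&orderIndex=&orderSign=1&str=all&isVirtal=no")
OTHER_DEPT_IDS = ['0331500020']  # placeholder: replace with real department IDs

def crawl_dept(dept_id):
    """Return one comma-separated line per person in the given department."""
    # Fetch page 1 once just to read the hidden totalPage input
    first = requests.get(BASE_URL.format(dept_id=dept_id, page=1), headers=headers)
    first.encoding = 'utf-8'
    total_page = int(BeautifulSoup(first.text, 'lxml')
                     .find('input', attrs={'name': 'totalPage'})['value'])

    cells = []
    for page in range(1, total_page + 1):
        r = requests.get(BASE_URL.format(dept_id=dept_id, page=page), headers=headers)
        r.encoding = 'utf-8'
        page_soup = BeautifulSoup(r.text, 'lxml')
        for tbody in page_soup.find_all('tbody'):
            cells.extend(td.text.strip() for td in tbody.find_all('td'))

    # Same layout as in the main script: 6 cells per person, the first cell empty
    return [','.join(cells[i + 1:i + 6]) for i in range(0, len(cells), 6)]

for dept_id in OTHER_DEPT_IDS:
    with open('dept_{}.txt'.format(dept_id), 'a', encoding='utf-8') as f:
        f.write('\n'.join(crawl_dept(dept_id)))

Item 2 would likely follow the same request pattern: fetch each person's detail page with the same headers, then write any image response's r.content to a file opened in binary ('wb') mode.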

Update, November 6: the maximum page count and the department's head count are now read from the scraped page itself, and the related code was tidied up.
