腾讯和头条的爬虫代码

腾讯:

 

import requests,re
import requests
import json
import pymysql

class mysql_conn(object):
    """Small wrapper that owns one PyMySQL connection and one cursor.

    The connection is opened in the constructor and closed in the
    destructor. The defaults reproduce the settings that were previously
    hard-coded, so ``mysql_conn()`` behaves as before.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 port=3306, database='py11_mysql'):
        """Open the connection and create a cursor on it.

        Args:
            host: MySQL server host.
            user: Login user name.
            password: Login password.
            port: MySQL server TCP port.
            database: Name of the database to use.
        """
        # Pre-set both attributes so __del__ stays safe if connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, database=database)
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, params=None):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: The statement to run. Use ``%s`` placeholders together
                with ``params`` instead of formatting values into the text.
            params: Optional values bound to the placeholders by the driver.

        Raises:
            Exception: Whatever the cursor or connection raises; the
                transaction is rolled back before the error propagates.
        """
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def __del__(self):
        """Close the cursor and the connection if they were ever created."""
        # getattr guards against instances whose __init__ never ran to the
        # point of assigning these attributes.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


def tx(sql):
    """Crawl Tencent HR listing pages and store each position in MySQL.

    For every listing page, the detail links are extracted, each detail
    page is fetched, five fields are pulled out with regular expressions
    and one row is inserted into the ``tx`` table.

    Args:
        sql: Despite its name, this is the index of the last listing page
            to fetch. Pages ``0..sql`` inclusive are crawled (``sql + 1``
            pages in total), 10 positions per page.

    Raises:
        requests.RequestException: If a page cannot be fetched in time.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    }

    # NOTE(review): these patterns are kept exactly as they appear in the
    # listing. They look like they lost their HTML tag anchors when the
    # listing was extracted (the link pattern is empty and the captures are
    # not delimited) -- restore them against the real page markup before
    # relying on the stored values.
    parttern = ''
    title_parttern = '(.*?)'
    place_parttern = '工作地点:(.*?)'
    job_list_parttern = '职位类别:(.*?)'
    job_duty_parttern = r'工作职责:\n\s.*\n  • (.*?)\n'
    job_ask_parttern = r'工作要求:\n\s.*\n  • (.*?)\n'

    # Placeholders let the driver quote the scraped text; formatting the
    # values into the statement breaks on any quote and allows injection.
    insert_sql = ('insert into tx(title,place,job_list,job_duty,job_ask) '
                  'values (%s,%s,%s,%s,%s)')

    # One connection for the whole crawl instead of one per inserted row.
    my = mysql_conn()

    page = ''
    p_page = 0
    i = 0
    while i <= sql:
        url = 'https://hr.tencent.com/position.php?keywords=&lid=0' + page
        print(url)
        response = requests.get(url, headers=headers, timeout=10)

        for info_url_list in re.findall(parttern, response.text):
            # An empty match carries no link, so there is nothing to fetch.
            if not info_url_list:
                continue
            info_url_all = 'https://hr.tencent.com/' + info_url_list
            info_html = requests.get(info_url_all, headers=headers,
                                     timeout=10).text

            # A field whose pattern does not match is stored as ''.
            row = (
                _first_group(title_parttern, info_html),
                _first_group(place_parttern, info_html),
                _first_group(job_list_parttern, info_html),
                _first_group(job_duty_parttern, info_html),
                _first_group(job_ask_parttern, info_html),
            )
            # Run through the cursor directly so the values are bound as
            # parameters, then commit the row.
            my.cursor.execute(insert_sql, row)
            my.db.commit()

        i += 1
        # The listing is paginated through a `start` offset, 10 per page.
        p_page += 10
        page = '&start=' + str(p_page) + '#a'
        print(page)


if __name__ == '__main__':
    tx(10)

头条:

import re
import json
import requests
import os
from urllib import request



# Download every image of one Toutiao gallery article into a local folder.
url = 'https://www.toutiao.com/a6331698802248909057/'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
html_str = response.text

# Captures the argument of the `gallery: JSON.parse(...)` call in the page.
pattern = r'gallery: JSON\.parse\((.*)\),'

res_lists = re.search(pattern, html_str)

path = 'D:/Python/download'
# Create the target folder first; os.chdir raises if it does not exist.
os.makedirs(path, exist_ok=True)
# Files are saved under relative names, so they land in the current directory.
os.chdir(path)

if res_lists:
    # The captured argument is decoded twice: the first decode is expected
    # to yield a string (presumably JSON text), the second the gallery dict.
    img_lists = json.loads(res_lists.group(1))
    img_dict = json.loads(img_lists)
    for img in img_dict['sub_images']:
        img_url = img['url']
        # Name the file after the last URL path segment.
        filename = img_url.split('/')[-1] + '.jpg'
        request.urlretrieve(img_url, filename)

你可能感兴趣的:(作业)