Python3--批量爬取数据之调用有道api进行翻译

上代码:

  # coding=utf-8
import urllib,urllib.request
from fake_useragent import UserAgent
import json
import time
import hashlib
import urllib.parse
import requests
import random
import csv,re

class YouDaoFanyi:
    """Client for the Youdao open translation API (v1 signing scheme)."""

    def __init__(self, appKey, appSecret):
        self.url = 'https://openapi.youdao.com/api/'
        # Random desktop User-Agent so requests look like a normal browser.
        self.headers = {'User-Agent': str(UserAgent().random)}
        self.appKey = appKey        # application id
        self.appSecret = appSecret  # application secret
        self.langFrom = 'EN'        # source language ('auto' = auto-detect)
        self.langTo = 'zh-CHS'      # target language ('auto' = auto-detect)

    def getUrlEncodedData(self, queryText):
        '''
        Build the url-encoded query string for the API.
        :param queryText: text to translate
        :return: url-encoded payload string
        '''
        salt = '2'  # a fixed salt is accepted by the v1 signing scheme
        # v1 signature: md5(appKey + q + salt + appSecret)
        sign_str = (self.appKey + queryText + salt + self.appSecret).encode('utf-8')
        sign = hashlib.md5(sign_str).hexdigest()
        payload = {
            'q': queryText,
            'from': self.langFrom,
            'to': self.langTo,
            'appKey': self.appKey,
            'salt': salt,
            'sign': sign
        }
        # NOTE: the API is called with GET, so the payload rides on the URL.
        return urllib.parse.urlencode(payload)

    def parseHtml(self, html):
        '''
        Parse the JSON response and print/return the translation.
        :param html: raw JSON response body
        :return: translated text, or '' when the API returned an error
        '''
        data = json.loads(html)
        print('-------------------------')
        # Error responses carry 'errorCode' and no 'translation' key; the
        # original code raised KeyError here and killed the whole batch.
        if 'translation' not in data:
            print('translation failed, errorCode: ' + str(data.get('errorCode')))
            return ''
        translationResult = data['translation']
        if isinstance(translationResult, list):
            translationResult = translationResult[0]
        print(translationResult)
        return translationResult

    def translate(self, queryText):
        '''
        Translate queryText through a randomly chosen HTTP proxy.
        :param queryText: text to translate
        :return: translated text ('' on API error)
        '''
        data = self.getUrlEncodedData(queryText)  # url-encoded payload
        target_url = self.url + '?' + data        # full GET url
        # Pick a fresh random proxy for every request (basic ban avoidance).
        ip_list = get_ip_list()
        proxies = get_random_ip(ip_list)
        print('随机ip为:' + str(proxies))
        req = requests.get(target_url, proxies=proxies, headers=self.headers)
        req.encoding = 'utf-8'
        return self.parseHtml(req.text)

# Read a '|'-delimited CSV file, dropping empty rows.
def read_file(filepath):
    """Return the non-empty rows of *filepath* as lists of fields."""
    with open(filepath, 'r', encoding='utf-8') as src:
        parsed = csv.reader(src, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        return [record for record in parsed if record]

# Append one scraped record to a '|'-delimited CSV file.
# newline='' stops csv.writer from emitting an extra blank line per row.
def write_file(filepath, row):
    """Append *row* to *filepath* as a single '|'-separated CSV record."""
    with open(filepath, 'a+', encoding='utf-8', newline='') as out:
        csv.writer(out, delimiter='|', quoting=csv.QUOTE_MINIMAL).writerow(row)

# Load the proxy IP list, one "host:port" entry per line.
def get_ip_list(filepath='IP.txt'):
    """Return the raw lines of the proxy list file (trailing newlines kept).

    :param filepath: path of the proxy list, defaults to 'IP.txt' for
        backward compatibility with the original hard-coded name
    :return: list of lines, e.g. ['1.2.3.4:80\\n', ...]
    """
    # Context manager guarantees the handle is closed even on error (the
    # original used bare open/close with no exception protection).
    with open(filepath, 'r', encoding='utf-8') as fh:
        return fh.readlines()

# Pick one proxy at random from the list and wrap it for requests.
def get_random_ip(ip_list):
    """Return a requests-style proxies dict for a random entry of *ip_list*."""
    chosen = random.choice(ip_list).strip('\n')
    return {'http': chosen}

if __name__ == "__main__":
    # Batch-translate the first column of every row and, when the result
    # actually contains Chinese, store it into column 6 before re-writing.
    print('程序开始运行!')
    appKey = '应用id'  # application id
    appSecret = '应用密钥'  # application secret
    fanyi = YouDaoFanyi(appKey, appSecret)
    reader = read_file('E_baiduBaike_notHaveChinese.csv')
    # Hoisted out of the loop: the original recompiled this CJK-range
    # regex once per row for no benefit.
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
    for row in reader:
        print('现在翻译的人名是:' + row[0])
        translationResult = fanyi.translate(row[0])
        print('翻译结果为:' + str(translationResult))
        # Only keep the result if it contains at least one Chinese character.
        if zhPattern.search(translationResult):
            row[6] = translationResult
        write_file('经有道翻译处理后的文件/E_baiduBaike_youdaoChinese.csv', row)
    print('爬取完成')

你可能感兴趣的:(数据处理)