TypeScript实现简单爬虫

提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档

文章目录

  • 前言
  • 一、初始爬虫代码
  • 二、爬虫代码改进(拆离网页解析功能)
  • 三、爬虫代码改进(融合单例模式)
  • 四、运行代码
  • 总结


前言

随着 TypeScript 在 Node.js 开发中的普及,用它来编写类型安全的小工具越来越方便。本文以一个简单的网页爬虫为例,介绍如何用 TypeScript 实现抓取、解析与数据存储,并逐步引入解析器拆分与单例模式来改进代码结构。


一、初始爬虫代码

crowller.ts

// lilied

// html获取模块
import superagent from 'superagent'

// html解析模块
import { load } from 'cheerio'
import fs from 'fs'
import path from 'path'

// Shape of one scraped record from the listing page.
interface PersonInfo {
    img_url: string; // thumbnail image URL (img src)
    name: string;    // person name (img alt text)
    link: string;    // detail-page link (anchor href)
    store: string;   // count text parsed from the '.dianji' element
}

// Accumulated crawl results: each crawl run is stored under the
// timestamp (ms since epoch) at which it was taken.
interface Result {
    [timestamp: string]: PersonInfo[]
}

// Simple crawler: fetch a listing page, parse records, and append them
// (keyed by timestamp) to a JSON data file.
// (Class/method names kept as-is for compatibility; "Crowller"/"Sprder"
// are the original tutorial's spellings of "Crawler"/"Spider".)
class Crowller {
    // Output file for scraped data, relative to the compiled output dir.
    fullPath = path.resolve(__dirname, '../data/data.json')

    constructor(private _url: string) {
        // Fire-and-forget, but catch here: an async method launched from a
        // constructor would otherwise surface network/parse failures as an
        // unhandled promise rejection.
        this.initSprderProcess().catch((err) => {
            console.error('爬虫执行失败:', err)
        })
    }

    // Full pipeline: fetch -> parse -> serialize -> persist.
    async initSprderProcess(): Promise<void> {
        const text = await this.getWebHtml(this._url)
        const personList = this.parseHtml(text)
        const resultStr = this.generateJsonContent(personList)
        this.storeData(resultStr)
    }

    // Fetch the raw HTML of the given URL.
    async getWebHtml(url: string): Promise<string> {
        const result = await superagent.get(url)
        return result.text
    }

    // Parse the listing page into structured PersonInfo records.
    parseHtml(text: string): PersonInfo[] {
        const personList: PersonInfo[] = []
        const $ = load(text)
        // .each, not .map: we iterate purely for side effects.
        $('.rw_col').each((index, element) => {
            // Query the <img> once instead of twice.
            const img = $(element).find('div.rw_img > a > img')
            const img_url = img.attr('src') || ''
            const name = img.attr('alt') || ''
            const link = $(element).find('div.rw_img > a').attr('href') || ''
            const store = $(element).find('.dianji').text().split(':')[1] || ''
            personList.push({ img_url, name, link, store })
        })
        return personList
    }

    // Merge the new batch into any existing file content, keyed by the
    // current timestamp, and return the serialized JSON string.
    generateJsonContent(newData: PersonInfo[]): string {
        const fullPath = this.fullPath
        let result: Result = {}
        if (fs.existsSync(fullPath)) {
            result = JSON.parse(fs.readFileSync(fullPath, 'utf8'))
        }
        result[new Date().getTime()] = newData
        return JSON.stringify(result)
    }

    // Persist serialized data asynchronously.
    storeData(data: string) {
        fs.writeFile(this.fullPath, data, (err) => {
            if (err) {
                // Throwing inside an async callback would crash the
                // process with an uncaught exception; log instead.
                console.error('数据写入失败:', err)
                return
            }
            console.log('数据写入成功');
        })
    }
}


// Entry point: start the crawler against the listing page.
function main() {
    const URL = 'http://www.sextp.com/jpmt/'
    // The constructor kicks off the crawl; the unused local binding
    // from the original has been dropped.
    new Crowller(URL)
}


main()

二、爬虫代码改进(拆离网页解析功能)

crowller.ts

// lilied

// html获取模块
import superagent from 'superagent'
import path from 'path'
import fs from 'fs'
import DellAnalyzer from './DellAnalyzer'

// Contract for pluggable HTML analyzers: given the raw page HTML and the
// path of the existing data file, return the serialized JSON string that
// the crawler should persist.
export interface Analyzer {
    analyze: (html: string, fullPath: string) => string
}

// Crawler with the parsing step delegated to a pluggable Analyzer.
// (Class/method names kept as-is for compatibility; "Crowller"/"Sprder"
// are the original tutorial's spellings of "Crawler"/"Spider".)
class Crowller {
    // Output file for scraped data, relative to the compiled output dir.
    fullPath = path.resolve(__dirname, '../data/data.json')

    constructor(private _url: string, private _analyzer: Analyzer) {
        // Catch here: an async method launched from a constructor would
        // otherwise surface failures as an unhandled promise rejection.
        this.initSprderProcess().catch((err) => {
            console.error('爬虫执行失败:', err)
        })
    }

    // Full pipeline: fetch -> analyze (delegated) -> persist.
    async initSprderProcess(): Promise<void> {
        const text = await this.getWebHtml(this._url)
        const resultStr = this._analyzer.analyze(text, this.fullPath)
        this.storeData(resultStr)
    }

    // Fetch the raw HTML of the given URL.
    async getWebHtml(url: string): Promise<string> {
        const result = await superagent.get(url)
        return result.text
    }

    // Persist serialized data asynchronously.
    storeData(data: string) {
        fs.writeFile(this.fullPath, data, (err) => {
            if (err) {
                // Throwing inside an async callback would crash the
                // process with an uncaught exception; log instead.
                console.error('数据写入失败:', err)
                return
            }
            console.log('数据写入成功');
        })
    }
}

function main() {
    const URL = 'http://www.sextp.com/jpmt/'
    const analyzer = new DellAnalyzer()
    new Crowller(URL, analyzer)
}

main()

DellAnalyzer.ts


// html解析模块
import { load } from 'cheerio'
import fs from 'fs'
import { Analyzer } from './crowller'

// Shape of one scraped record from the listing page.
interface PersonInfo {
    img_url: string; // thumbnail image URL (img src)
    name: string;    // person name (img alt text)
    link: string;    // detail-page link (anchor href)
    store: string;   // count text parsed from the '.dianji' element
}

// Accumulated crawl results: each crawl run is stored under the
// timestamp (ms since epoch) at which it was taken.
interface Result {
    [timestamp: string]: PersonInfo[]
}

// HTML analyzer: parses the listing page and merges the parsed records
// into the existing data-file content.
export default class DellAnalyzer implements Analyzer {

    // Parse the listing page into structured PersonInfo records.
    private parseHtml(text: string): PersonInfo[] {
        const personList: PersonInfo[] = []
        const $ = load(text)
        // .each, not .map: we iterate purely for side effects.
        $('.rw_col').each((index, element) => {
            // Query the <img> once instead of twice.
            const img = $(element).find('div.rw_img > a > img')
            const img_url = img.attr('src') || ''
            const name = img.attr('alt') || ''
            const link = $(element).find('div.rw_img > a').attr('href') || ''
            const store = $(element).find('.dianji').text().split(':')[1] || ''
            personList.push({ img_url, name, link, store })
        })
        return personList
    }

    // Merge the new batch into any existing file content at fullPath,
    // keyed by the current timestamp, and return the serialized result.
    private generateJsonContent(newData: PersonInfo[], fullPath: string): string {
        let result: Result = {}
        if (fs.existsSync(fullPath)) {
            result = JSON.parse(fs.readFileSync(fullPath, 'utf8'))
        }
        result[new Date().getTime()] = newData
        return JSON.stringify(result)
    }

    // Analyzer contract: HTML in, serialized JSON string out.
    public analyze(html: string, fullPath: string): string {
        const personList = this.parseHtml(html)
        return this.generateJsonContent(personList, fullPath)
    }

}

三、爬虫代码改进(融合单例模式)

crowller.ts

// lilied

// html获取模块
import superagent from 'superagent'
import path from 'path'
import fs from 'fs'
import DellAnalyzer from './DellAnalyzer'

// Contract for pluggable HTML analyzers: given the raw page HTML and the
// path of the existing data file, return the serialized JSON string that
// the crawler should persist.
export interface Analyzer {
    analyze: (html: string, fullPath: string) => string
}

// Crawler with the parsing step delegated to a pluggable Analyzer.
// (Class/method names kept as-is for compatibility; "Crowller"/"Sprder"
// are the original tutorial's spellings of "Crawler"/"Spider".)
class Crowller {
    // Output file for scraped data, relative to the compiled output dir.
    fullPath = path.resolve(__dirname, '../data/data.json')

    constructor(private _url: string, private _analyzer: Analyzer) {
        // Catch here: an async method launched from a constructor would
        // otherwise surface failures as an unhandled promise rejection.
        this.initSprderProcess().catch((err) => {
            console.error('爬虫执行失败:', err)
        })
    }

    // Full pipeline: fetch -> analyze (delegated) -> persist.
    async initSprderProcess(): Promise<void> {
        const text = await this.getWebHtml(this._url)
        const resultStr = this._analyzer.analyze(text, this.fullPath)
        this.storeData(resultStr)
    }

    // Fetch the raw HTML of the given URL.
    async getWebHtml(url: string): Promise<string> {
        const result = await superagent.get(url)
        return result.text
    }

    // Persist serialized data asynchronously.
    storeData(data: string) {
        fs.writeFile(this.fullPath, data, (err) => {
            if (err) {
                // Throwing inside an async callback would crash the
                // process with an uncaught exception; log instead.
                console.error('数据写入失败:', err)
                return
            }
            console.log('数据写入成功');
        })
    }
}

function main() {
    const URL = 'http://www.sextp.com/jpmt/'
    const analyzer = DellAnalyzer.getInstance()
    new Crowller(URL, analyzer)
}


main()

DellAnalyzer.ts


// html解析模块
import { load } from 'cheerio'
import fs from 'fs'
import { Analyzer } from './crowller'

// Shape of one scraped record from the listing page.
interface PersonInfo {
    img_url: string; // thumbnail image URL (img src)
    name: string;    // person name (img alt text)
    link: string;    // detail-page link (anchor href)
    store: string;   // count text parsed from the '.dianji' element
}

// Accumulated crawl results: each crawl run is stored under the
// timestamp (ms since epoch) at which it was taken.
interface Result {
    [timestamp: string]: PersonInfo[]
}

// Singleton HTML analyzer: parses the listing page and merges the parsed
// records into the existing data-file content.
export default class DellAnalyzer implements Analyzer {
    // Lazily-created shared instance (singleton).
    private static instance: DellAnalyzer

    // Private constructor: prevents external instantiation; use
    // getInstance() instead (singleton).
    private constructor() {}

    // Return the shared instance, creating it on first use (singleton).
    public static getInstance(): DellAnalyzer {
        if (!DellAnalyzer.instance) {
            DellAnalyzer.instance = new DellAnalyzer()
        }
        return DellAnalyzer.instance
    }

    // Parse the listing page into structured PersonInfo records.
    private parseHtml(text: string): PersonInfo[] {
        const personList: PersonInfo[] = []
        const $ = load(text)
        // .each, not .map: we iterate purely for side effects.
        $('.rw_col').each((index, element) => {
            // Query the <img> once instead of twice.
            const img = $(element).find('div.rw_img > a > img')
            const img_url = img.attr('src') || ''
            const name = img.attr('alt') || ''
            const link = $(element).find('div.rw_img > a').attr('href') || ''
            const store = $(element).find('.dianji').text().split(':')[1] || ''
            personList.push({ img_url, name, link, store })
        })
        return personList
    }

    // Merge the new batch into any existing file content at fullPath,
    // keyed by the current timestamp, and return the serialized result.
    private generateJsonContent(newData: PersonInfo[], fullPath: string): string {
        let result: Result = {}
        if (fs.existsSync(fullPath)) {
            result = JSON.parse(fs.readFileSync(fullPath, 'utf8'))
        }
        result[new Date().getTime()] = newData
        return JSON.stringify(result)
    }

    // Analyzer contract: HTML in, serialized JSON string out.
    public analyze(html: string, fullPath: string): string {
        const personList = this.parseHtml(html)
        return this.generateJsonContent(personList, fullPath)
    }

}

四、运行代码

(npm install -D ts-node 测试用)

  1. npm install -D nodemon 注意配置nodemonConfig
  2. npm install -D concurrently
  3. tsconfig.json 中 打开rootDir与outDir

package.json

{
  "name": "20221206",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "dev:build": "tsc -w",
    "dev:start": "nodemon ./dist/crowller.js",
    "dev":"tsc && concurrently npm:dev:*"
  },
  "nodemonConfig": {
    "ignore":[
      "data/*"
    ]
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "devDependencies": {
    "concurrently": "^7.6.0",
    "nodemon": "^2.0.20",
    "ts-node": "^10.9.1",
    "typescript": "^4.9.3"
  },
  "dependencies": {
    "@types/superagent": "^4.1.16",
    "cheerio": "^1.0.0-rc.12",
    "superagent": "^8.0.5"
  }
}


总结

以上就是本文的全部内容:我们用 TypeScript 实现了一个简单爬虫,先完成了抓取—解析—存储的基本流程,随后将网页解析功能拆分为独立的 DellAnalyzer 类,最后通过单例模式复用解析器实例,并给出了配合 nodemon 与 concurrently 的运行配置。

你可能感兴趣的:(typescript,爬虫,javascript,前端)