[功能介紹]nodejs爬蟲怎么設置ip代理?
我們在進行網站爬蟲的時候,會比較常碰上IP被封殺的情況,如果IP在同一時間經常訪問,那么我們的IP掛掉的概率就非常高。所以我們需要設置動態ip代理來解除。
IP海帶來的教程介紹:
動態IP:
設置動態IP需要用到一個superagent插件—superagent-proxy,除此之外為了避免每次爬取時都去獲取一次動態IP的列表,將爬取到的動態IP列表存放在redis中,并設置10分鐘的過期時間。數據過期之后再重新發送獲取動態IP的請求。
package.json
{
"name": "xxx",
"version": "1.0.0",
"description": "xxx",
"main": "arf.js",
"scripts": {
"arf": "nodemon src/app.js --exec babel-node --config package.json"
},
"keywords": [
"爬蟲"
],
"author": "lidikang",
"license": "MIT",
"dependencies": {
"bluebird": "^3.5.1",
"cheerio": "^1.0.0-rc.2",
"eventproxy": "^1.0.0",
"mongoose": "^4.13.6",
"mongoose-findorcreate": "^2.0.0",
"progress": "^2.0.0",
"redis": "^2.8.0",
"superagent": "^3.8.1",
"superagent-proxy": "^1.0.2"
},
"devDependencies": {
"babel-cli": "^6.26.0",
"babel-preset-es2015": "^6.24.1",
"babel-preset-stage-2": "^6.24.1",
"nodemon": "^1.12.4"
},
"nodemonConfig": {
"ignore": [
"ips.json",
"docs/*"
],
"delay": "2500"
}
}
app.js
import request from 'superagent'
import requestProxy from 'superagent-proxy'
import redis from 'redis'
// Install the superagent-proxy plugin on superagent; the request chain below relies on it for `.proxy()`.
requestProxy(request)
// Promisify the redis client and multi prototypes with bluebird so that `*Async` variants (e.g. `getAsync`) are available.
// NOTE(review): `bluebird` is not imported in the visible part of this file — confirm it is imported elsewhere.
bluebird.promisifyAll(redis.RedisClient.prototype)
bluebird.promisifyAll(redis.Multi.prototype)
// Open the redis connection.
// NOTE(review): the original comment also mentioned mongoose, but no mongoose connection is made in the visible code.
const redisClient = connectRedis()
/**
 * Create a redis client and attach connection-status logging.
 *
 * The connection URL is read from `config.REDIS_URL`. One line is logged when
 * the client emits "ready", and one line containing the error is logged each
 * time the client emits "error".
 *
 * @returns {object} The client returned by `redis.createClient`.
 */
function connectRedis() {
  const client = redis.createClient(config.REDIS_URL)
  // The ready notification needs no arguments, so the handler takes none.
  client.on('ready', () => {
    console.log('redis連接 √')
  })
  // Log connection errors so they do not go unnoticed.
  client.on('error', (err) => {
    console.log(`redis錯誤,${err} ×`)
  })
  return client
}
/**
 * Pick one element of a non-empty array at random.
 *
 * @param {Array} items - Candidates to choose from.
 * @returns {*} One element of `items`.
 */
function pickRandomItem(items) {
  // Math.floor rather than parseInt: parseInt stringifies its argument first,
  // so a tiny value such as 1e-7 becomes "1e-7" and parses as 1, which can
  // index past the end of a short list.
  return items[Math.floor(Math.random() * items.length)]
}

/**
 * Return one proxy address, preferring the list cached in redis.
 *
 * The cached list is stored under the key `proxy_ips` as a comma-separated
 * string. When the key is missing or empty, a fresh list is requested from the
 * proxy API; a usable list is written back to redis with a 10-minute expiry.
 * When the API response carries no usable list, nothing is cached and the
 * fallback `'http://127.0.0.1'` is returned.
 *
 * @returns {Promise<string>} A randomly chosen proxy entry, or the fallback address.
 */
async function getProxyIp() {
  // Try the cached list first.
  const cachedIps = await redisClient.getAsync('proxy_ips')
  if (cachedIps) {
    return pickRandomItem(cachedIps.split(','))
  }

  // Cache miss or expired: request a fresh list.
  const response = await request.get('http://api.pcdaili.com/?orderid=888888888&num=100&protocol=1&method=1&an_ha=1&sp1=1&sp2=1&format=json&sep=1')
  const payload = response.body
  // Guard every level: the body may lack `data` or `proxy_list`.
  const proxyList = payload && payload.data && payload.data.proxy_list
  if (!Array.isArray(proxyList) || proxyList.length === 0) {
    // Do not cache a failed lookup, so the next call retries the API.
    return 'http://127.0.0.1'
  }

  // Cache the fetched list for 10 minutes.
  redisClient.set('proxy_ips', proxyList.join(','), 'EX', 10 * 60)
  return pickRandomItem(proxyList)
}
/**
 * Send one GET request to the target site through a randomly chosen proxy,
 * using a randomly chosen User-Agent from `userAgents`.
 *
 * The request is dispatched with a 5 s response timeout and a 60 s overall
 * deadline; the response is handled inside the `.end` callback.
 *
 * @returns {Promise<void>} Resolves once the request has been dispatched.
 */
async function doRequest() {
  // Math.floor rather than parseInt: parseInt stringifies its argument, so a
  // tiny random product such as 1e-7 would parse as 1 instead of 0.
  const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)]
  const ip = await getProxyIp()
  // The proxy is passed as a URI. Prepend the scheme only when the entry has
  // none; the fallback returned by getProxyIp already starts with "http://".
  const proxyUrl = /^[a-z][a-z0-9+.-]*:\/\//i.test(ip) ? ip : `http://${ip}`
  request.get('http://www.xxx.com')
    .set({ 'User-Agent': userAgent })
    .timeout({ response: 5000, deadline: 60000 })
    .proxy(proxyUrl)
    .end(async (err, res) => {
      // Process the response data here.
    })
}
通過上面的步驟來操作,我們就可以完成動態ip代理的設置了,可以開始使用了。
版權聲明:本文為IP海(iphai.cn)原創作品,未經許可,禁止轉載!
Copyright © www.wibm.ac.cn. All Rights Reserved. IP海 版權所有.
IP海僅提供中國內IP加速服務,無法跨境聯網,用戶應遵守《服務條款》內容,嚴禁用戶使用IP海從事任何違法犯罪行為。
鄂ICP備19030659號-3
鄂公網安備42100302000141號
計算機軟件著作權證
ICP/EDI許可證:鄂B2-20200106