使用superagent获取豆瓣信息
2022.06.29 Wed
想到在博客上插入一个豆瓣卡片以显示豆瓣主页信息,记得在 Hexo 插件库里面有这种功能的插件,但仔细看了看发现它们又太复杂,于是对照着一个插件的源码自己试着写了一下。
我的目的是想获取我的豆瓣上面书影音数量,使用 superagent 发起网络请求,cheerio 解析页面获取数据,这两个包我之前也没有了解过,于是在这里记录一下。
baseSpider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
const superagent = require('superagent');
const cheerio = require('cheerio');
/**
 * Base class for Douban profile spiders.
 *
 * Stores the HTTP client (superagent), the HTML parser (cheerio) and the
 * profile URL on the instance so subclasses can reach them through `this`.
 * `crawl` and `parsePlainText` are empty hooks meant to be overridden.
 */
class BaseSpider {
  /**
   * @param {string|number} id - Douban user id, interpolated into the profile URL.
   */
  constructor(id) {
    this.superagent = superagent;
    this.cheerio = cheerio;
    // Use an absolute HTTPS URL: a scheme-less string leaves the protocol up
    // to the HTTP client and costs at least one extra redirect.
    this.DOUBAN_PAGE = `https://www.douban.com/people/${id}/`;
  }

  /**
   * Fetch the page. Hook for subclasses; the base implementation does nothing.
   */
  crawl() {}

  /**
   * Extract data from the fetched HTML. Hook for subclasses; the base
   * implementation does nothing.
   *
   * @param {string} plainText - Raw HTML of the fetched page.
   */
  parsePlainText(plainText) {}
}
module.exports.BaseSpider = BaseSpider;
indexSpider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
const { BaseSpider } = require('./baseSpider');
/**
 * Spider for a Douban profile index page: collects the movie and book
 * counters shown in the `#movie` and `#book` sections.
 *
 * No explicit constructor is needed; the default derived constructor
 * forwards `id` to BaseSpider unchanged.
 */
class IndexSpider extends BaseSpider {
  /**
   * Request the profile page and parse it.
   *
   * @returns {Promise<{movies: Object<string, string>, books: Object<string, string>}>}
   *   Resolves with the parsed counters; rejects with the request error, or
   *   with whatever `parsePlainText` throws.
   */
  crawl() {
    // The request is already thenable, so return the chain directly instead
    // of wrapping it in `new Promise`.
    return this.superagent
      .get(this.DOUBAN_PAGE)
      .set({
        'User-Agent':
          'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
      })
      .then((res) => this.parsePlainText(res.text));
  }

  /**
   * Pull the per-category counters out of the profile HTML.
   *
   * For every `.pl a` link inside a section, each category label contained in
   * the link text becomes a key whose value is the link text with all
   * non-digit characters removed (a digit string, not a number).
   *
   * @param {string} plainText - Raw HTML of the profile page.
   * @returns {{movies: Object<string, string>, books: Object<string, string>}}
   *   Categories that do not appear on the page are simply absent.
   */
  parsePlainText(plainText) {
    const $ = this.cheerio.load(plainText);

    // One entry per section: where to look and which labels to recognise.
    const sections = {
      movies: { selector: '#content #movie', labels: ['在看', '看过', '想看'] },
      books: { selector: '#content #book', labels: ['在读', '读过', '想读'] },
    };

    const info = {};
    Object.keys(sections).forEach((key) => {
      const { selector, labels } = sections[key];
      const counts = {};
      $(selector)
        .find('.pl a')
        .toArray()
        .forEach((link) => {
          const text = $(link).text();
          labels.forEach((label) => {
            if (text.includes(label)) {
              counts[label] = text.replace(/\D/g, '');
            }
          });
        });
      info[key] = counts;
    });
    return info;
  }
}
module.exports.IndexSpider = IndexSpider;
index.js
1
2
3
4
5
6
7
8
9
10
11
12
const { IndexSpider } = require('./indexSpider');
// Entry script: crawl one Douban profile and print the parsed result.
// Douban user id passed to the spider constructor.
const id = 161208120;
const doubanSpider = new IndexSpider(id);
// crawl() returns a promise: log the resolved value on success,
// or log the error if the promise rejects.
doubanSpider
.crawl()
.then((res) => {
console.log(res);
})
.catch((err) => {
console.log(err);
});最终输出结果
1
2
3
4
info:{
movies: { '在看': '5', '看过': '871', '想看': '268' },
books: { '在读': '3', '读过': '89', '想读': '34' }
}
暂时还没有想到怎么合理地嵌到我的博客里面,先做出这个 demo,待日后再作研究。