【WebMagic】抓取前端渲染的页面

随着AJAX技术不断的普及,以及现在AngularJS这种Single-page application框架的出现,现在js渲染出的页面越来越多。对于爬虫来说,这种页面是比较讨厌的:仅仅提取HTML内容,往往无法拿到有效的信息。那么如何处理这种页面呢?总的来说有两种做法:

  1. 在抓取阶段,在爬虫中内置一个浏览器内核,执行js渲染页面后,再抓取。这方面对应的工具有Selenium、HtmlUnit或者PhantomJs。但是这些工具都存在一定的效率问题,同时也不是那么稳定。好处是编写规则同静态页面一样。
  2. 因为js渲染页面的数据也是从后端拿到,而且基本上都是AJAX获取,所以分析AJAX请求,找到对应数据的请求,也是比较可行的做法。而且相对于页面样式,这种接口变化可能性更小。缺点就是找到这个请求,并进行模拟,是一个相对困难的过程,也需要相对多的分析经验。

方法一是通过执行js渲染页面获取静态页面来抓取数据,而方法二则是找规律找到目标数据的请求URL来获取数据。
本次着重讲解方法一,这种方式相对来说较为简单。

环境安装

  1. 谷歌浏览器+驱动
    下载谷歌浏览器,然后下载该浏览器对应版本的驱动。
    查看谷歌浏览器版本:
    chrome://settings/help
    驱动下载地址:
    源地址:http://chromedriver.storage.googleapis.com/index.html
    国内镜像:http://npm.taobao.org/mirrors/chromedriver/

项目搭建

我使用的是SpringBoot框架,项目的目录结构如下:
crawler.png
各源文件如下:
pom.xml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.4.0</version>
</parent>

<groupId>com.born2do</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>

<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<webmagic.version>0.7.4</webmagic.version>
</properties>

<dependencies>
<!-- https://mvnrepository.com/artifact/org.springframework.boot/spring-boot-starter-web -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework.boot/spring-boot-starter-data-jpa -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework.boot/spring-boot-starter-test -->
<!-- Test-only dependency: scoped to "test" so it is not put on the runtime classpath of the packaged application. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${webmagic.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${webmagic.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>30.0-jre</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.kotcrab.remark/remark -->
<dependency>
<groupId>com.kotcrab.remark</groupId>
<artifactId>remark</artifactId>
<version>1.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.33.0</version>
</dependency>
</dependencies>

</project>

CrawlerOnSpringBoot2Doc.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package com.born2do.task;

import com.born2do.webmagic.downloader.selenium.SeleniumDownloader;
import com.overzealous.remark.Remark;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

/**
 * Crawls the "SpringBoot2核心技术与响应式编程" tutorial hosted on Yuque and stores it as Markdown.
 *
 * <p>Note from the original author: the Yuque front end is fairly complex, its data is
 * encrypted in several layers and the pages are mostly rendered through AJAX. Code blocks
 * and mind maps are not post-processed here and have to be cleaned up by hand.
 *
 * @author chenhy
 * @date 2021/3/22
 */
@Component
public class CrawlerOnSpringBoot2Doc implements PageProcessor {

    /** Entry page whose table of contents links to every chapter. */
    private static final String WEBSITE = "https://www.yuque.com/atguigu/springboot";

    /** Markdown file that receives the converted chapters. */
    private static final String OUTPUT_FILE = "D:\\SpringBoot2核心技术与响应式编程.md";

    private final Site site = Site.me().setCharset("UTF8") // page encoding
            .setTimeOut(1000 * 30) // request timeout (ms)
            .setRetrySleepTime(1000 * 5) // pause between retries (ms)
            .setRetryTimes(10); // number of retries

    /**
     * Handles one downloaded page.
     *
     * <p>For the entry page, every chapter link is queued and the page itself is skipped so
     * that no pipeline runs for it. For any other page, the title and body elements are
     * converted from HTML to Markdown and stored in the {@code content} result field.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (WEBSITE.equals(page.getUrl().toString())) {
            // Collect every chapter URL from the table of contents and queue it.
            List<String> urls = page.getHtml().xpath("//span[@class='name']").links().all();
            for (String url : urls) {
                page.addTargetRequest(url);
            }
            // The entry page carries no content of its own; keep it away from the pipelines.
            page.setSkip(true);
            return;
        }

        String title = page.getHtml().xpath("//div[@class='index-module_title_1s0gC']").toString();
        String body = page.getHtml().xpath("//div[@class='yuque-doc-content']").toString();
        if (title == null && body == null) {
            // Nothing matched (e.g. the page was not rendered yet): do not write an empty entry.
            page.setSkip(true);
            System.out.println(page.getUrl() + " has no content, skipped!");
            return;
        }

        // A missing part becomes an empty string; plain concatenation would insert the text "null".
        String html = (title == null ? "" : title) + (body == null ? "" : body);

        // Convert the extracted HTML to Markdown.
        Remark remark = new Remark();
        page.putField("content", remark.convert(html));
        System.out.println(page.getUrl() + " download over!");
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: recreates the output file and runs the crawl once.
     * The next run is scheduled 24 hours after this one returns.
     *
     * @throws IOException if the output file cannot be deleted or created
     */
    @Scheduled(fixedDelay = 1000 * 60 * 60 * 24)
    private void mainProcess() throws IOException {
        Path filePath = Paths.get(OUTPUT_FILE);
        // Start from an empty file on every run.
        Files.deleteIfExists(filePath);
        Files.createFile(filePath);
        Spider.create(this)
                .addUrl(WEBSITE)
                .setDownloader(new SeleniumDownloader("D:\\chromedriver_win32\\chromedriver.exe"))
                .thread(1)
                .addPipeline(new MarkdownPipeLine(filePath))
                .run();
        System.out.println("process is over!");
    }
}

MarkdownPipeLine.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
package com.born2do.task;

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Map;

/**
 * Pipeline that appends the {@code content} field of every crawled page to one Markdown file.
 *
 * @author chenhy
 * @date 2021/3/21
 */
public class MarkdownPipeLine implements Pipeline {

    /** File that receives the appended content. */
    private final Path filePath;

    /**
     * @param filePath file that the page content is appended to
     */
    public MarkdownPipeLine(Path filePath) {
        this.filePath = filePath;
    }

    /**
     * Appends the page's {@code content} field followed by a line separator.
     * Pages without a {@code content} field are ignored.
     *
     * @param resultItems extraction results of one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> result = resultItems.getAll();
        String content = (String) result.get("content");
        if (content == null) {
            // Pages that only contribute links put no content; there is nothing to write.
            return;
        }
        // Content and trailing line break go out in one write, so the file is opened once per page.
        byte[] entry = (content + System.lineSeparator()).getBytes(StandardCharsets.UTF_8);
        try {
            Files.write(filePath, entry, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

SeleniumDownloader.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package com.born2do.webmagic.downloader.selenium;

import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

/**
 * Downloader that lets a real browser, driven through Selenium, render the page before its
 * HTML is handed to the crawler. Drivers are taken from a {@link WebDriverPool}.<br>
 * A matching Selenium driver binary has to be installed separately.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:37 PM <br>
 */
public class SeleniumDownloader implements Downloader, Closeable {

    /** Created on first use; volatile so the double-checked initialisation below is safe. */
    private volatile WebDriverPool webDriverPool;

    private Logger logger = LoggerFactory.getLogger(getClass());

    private int sleepTime = 0;

    private int poolSize = 1;

    /**
     * Creates a downloader and publishes the chromedriver location through the
     * {@code webdriver.chrome.driver} system property.
     *
     * @param chromeDriverPath path of the chromedriver executable
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.getProperties().setProperty("webdriver.chrome.driver",
                chromeDriverPath);
    }

    /**
     * Creates a downloader without touching any system property.
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
    }

    /**
     * Sets how long to wait after a page has been opened before its HTML is read.
     *
     * @param sleepTime wait time in milliseconds
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Opens the request URL in a pooled browser and returns the rendered document.
     *
     * @param request request to download
     * @param task    running task; its site supplies the cookies
     * @return the rendered page, or {@code null} if the thread was interrupted while waiting
     *         for a free driver
     */
    @Override
    public Page download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            logger.warn("interrupted", e);
            return null;
        }
        try {
            logger.info("downloading page {}", request.getUrl());
            webDriver.get(request.getUrl());
            try {
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                logger.warn("interrupted while waiting for page to render", e);
            }
            // NOTE(review): cookies are added after the page was loaded, so they only affect
            // later requests made by this driver - confirm this is intended.
            WebDriver.Options manage = webDriver.manage();
            Site site = task.getSite();
            if (site.getCookies() != null) {
                for (Map.Entry<String, String> cookieEntry : site.getCookies()
                        .entrySet()) {
                    Cookie cookie = new Cookie(cookieEntry.getKey(),
                            cookieEntry.getValue());
                    manage.addCookie(cookie);
                }
            }

            /*
             * TODO You can add mouse event or other processes
             *
             * @author: bob.li.0718@gmail.com
             */

            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } finally {
            // Hand the driver back even when rendering failed; otherwise the pool runs dry.
            webDriverPool.returnToPool(webDriver);
        }
    }

    /** Creates the driver pool on first use; the second check prevents creating it twice. */
    private void checkInit() {
        if (webDriverPool == null) {
            synchronized (this) {
                if (webDriverPool == null) {
                    webDriverPool = new WebDriverPool(poolSize);
                }
            }
        }
    }

    /**
     * Sets the pool size used when the pool is created.
     *
     * @param thread number of crawler threads
     */
    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    /**
     * Closes every driver of the pool. Does nothing if no page was ever downloaded.
     *
     * @throws IOException declared by {@link Closeable}
     */
    @Override
    public void close() throws IOException {
        WebDriverPool pool = webDriverPool;
        if (pool != null) {
            pool.closeAll();
        }
    }
}

WebDriverPool.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
package com.born2do.webmagic.downloader.selenium;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Pool of Selenium {@link WebDriver} instances with a fixed upper bound. Drivers are created
 * on demand up to the capacity; after that callers wait until one is returned.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-7-26 <br>
 * Time: 1:41 PM <br>
 */
class WebDriverPool {
    private Logger logger = LoggerFactory.getLogger(getClass());

    private final static int DEFAULT_CAPACITY = 5;

    private final int capacity;

    private final static int STAT_RUNNING = 1;

    private final static int STAT_CLOSED = 2;

    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);

    /** Driver created by the most recent successful {@link #configure()} call. */
    private WebDriver mDriver = null;

    private static final String DEFAULT_CONFIG_FILE = "/config.ini";
    private static final String DRIVER_FIREFOX = "firefox";
    private static final String DRIVER_CHROME = "chrome";

    protected static Properties sConfig;
    protected static DesiredCapabilities sCaps;

    /**
     * store webDrivers created
     */
    private List<WebDriver> webDriverList = Collections
            .synchronizedList(new ArrayList<WebDriver>());

    /**
     * store webDrivers available
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();

    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }

    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Reads the configuration and starts a new WebDriver, which is stored in {@code mDriver}.
     * The configuration comes from the file named by the {@code selenuim_config} system
     * property if it is set, otherwise from {@code /config.ini} on the classpath. This part
     * of the code originally comes from GhostDriver.
     * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
     *
     * @author bob.li.0718@gmail.com
     * @throws IOException              if the configuration cannot be found or read
     * @throws IllegalArgumentException if the configured driver is neither firefox nor chrome
     */
    public void configure() throws IOException {
        Properties config = new Properties();
        String override = System.getProperty("selenuim_config");
        if (override != null) {
            try (FileReader reader = new FileReader(override)) {
                config.load(reader);
            }
        } else {
            // Read the default as a stream: a classpath resource is not necessarily a plain
            // file (e.g. inside a jar), so its URL path cannot be opened with FileReader.
            try (InputStream in = getClass().getResourceAsStream(DEFAULT_CONFIG_FILE)) {
                if (in == null) {
                    throw new IOException(DEFAULT_CONFIG_FILE + " not found on classpath");
                }
                config.load(in);
            }
        }
        sConfig = config;

        // Prepare capabilities
        DesiredCapabilities caps = new DesiredCapabilities();
        caps.setJavascriptEnabled(true);
        caps.setCapability("takesScreenshot", false);
        sCaps = caps;

        // Start appropriate Driver
        String driver = config.getProperty("driver", DRIVER_CHROME);
        if (DRIVER_FIREFOX.equals(driver)) {
            mDriver = new FirefoxDriver(caps);
        } else if (DRIVER_CHROME.equals(driver)) {
            mDriver = new ChromeDriver(caps);
        } else {
            // Fail loudly; otherwise a stale or null driver would be handed to the pool.
            throw new IllegalArgumentException("Unsupported driver '" + driver
                    + "', expected '" + DRIVER_FIREFOX + "' or '" + DRIVER_CHROME + "'");
        }
    }

    /**
     * check whether input is a valid URL
     *
     * @author bob.li.0718@gmail.com
     * @param urlString urlString
     * @return true means yes, otherwise no.
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }

    /**
     * Takes a driver from the pool. An idle driver is reused if there is one; otherwise a new
     * one is created while the pool is below capacity; otherwise the call blocks until a
     * driver is returned.
     *
     * @return a driver for exclusive use until {@link #returnToPool(WebDriver)} is called
     * @throws InterruptedException  if interrupted while waiting for a driver
     * @throws IllegalStateException if the pool is closed, or if the first driver cannot be
     *                               created so that waiting could never succeed
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver idle = innerQueue.poll();
        if (idle != null) {
            return idle;
        }
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                if (webDriverList.size() < capacity) {
                    try {
                        configure();
                        WebDriver created = mDriver;
                        webDriverList.add(created);
                        // Hand the new driver straight to the thread that created it.
                        return created;
                    } catch (IOException e) {
                        if (webDriverList.isEmpty()) {
                            // No driver exists, so take() below would block forever.
                            throw new IllegalStateException("Failed to create WebDriver", e);
                        }
                        logger.warn("Failed to create WebDriver, waiting for an idle one", e);
                    }
                }
            }
        }
        return innerQueue.take();
    }

    /**
     * Puts a driver back so that other callers can use it.
     *
     * @param webDriver driver previously obtained from {@link #get()}
     * @throws IllegalStateException if the pool is closed
     */
    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }

    /**
     * @throws IllegalStateException if the pool has been closed
     */
    protected void checkRunning() {
        if (stat.get() != STAT_RUNNING) {
            throw new IllegalStateException("Already closed!");
        }
    }

    /**
     * Marks the pool as closed and quits every driver it created.
     *
     * @throws IllegalStateException if the pool was already closed
     */
    public void closeAll() {
        if (!stat.compareAndSet(STAT_RUNNING, STAT_CLOSED)) {
            throw new IllegalStateException("Already closed!");
        }
        // Iterating a synchronized list requires holding its lock manually.
        synchronized (webDriverList) {
            for (WebDriver webDriver : webDriverList) {
                logger.info("Quit webDriver" + webDriver);
                try {
                    webDriver.quit();
                } catch (RuntimeException e) {
                    // One driver failing to quit must not keep the others alive.
                    logger.warn("Failed to quit webDriver" + webDriver, e);
                }
            }
        }
    }

}

config.ini

1
2
3
4
5
6
7
# What WebDriver to use for the tests
#driver=firefox
driver=chrome

# Path to the Google Chrome launcher program
#chrome_exec_path=D:\chromedriver_win32\chromedriver.exe
chrome_driver_loglevel=DEBUG

项目地址: https://github.com/born2do/crawler.git

说明

Selenium 已不再支持 PhantomJS,以前还可以通过降低jar包版本的方式解决该问题,但是现在已经不行了,即使版本再低,也不会引入PhantomJS相关的jar包了,所以webmagic作者开发的webmagic-selenium已经不能再用了(好像已经很久都没有更新版本了。。。),我这里的处理是下载源码,在源码的基础上将PhantomJS的相关内容都已经剔除了。