只有java语法基础,项目用Springboot做的,怎么入门,学习路线怎么安排?
关注者
282被浏览
370,972登录后你可以
不限量看优质回答私信答主深度交流精彩内容一键收藏
爬虫技术日益盛行,爬虫几乎是python等脚本语言,但是现在大部分公司都是使用Java作为后台开发语言。在有爬虫需求的时候我们难免会有些慌乱。对于一个致力于解脱同行且负责任的[Duck TS](Duck TS)平台,这篇文章来解决你的难题。
1. 引入依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
2. 工具类
import lombok.SneakyThrows;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.http.converter.HttpMessageConverter;
import org.springframework.http.converter.StringHttpMessageConverter;
import org.springframework.web.client.RestTemplate;
import javax.net.ssl.SSLContext;
import java.nio.charset.Charset;
import java.security.cert.X509Certificate;
import java.util.List;
public class RestTemplateUtil {
@SneakyThrows
public static RestTemplate getInstance(String charset) {
TrustStrategy acceptingTrustStrategy = (X509Certificate[] chain, String authType) -> true;
//忽略证书
SSLContext sslContext = org.apache.http.ssl.SSLContexts.custom()
.loadTrustMaterial(null, acceptingTrustStrategy)
.build();
SSLConnectionSocketFactory csf = new SSLConnectionSocketFactory(sslContext);
Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.getSocketFactory())
.register("https", csf)
.build();
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(registry);
//连接池的最大连接数,0代表不限;如果取0,需要考虑连接泄露导致系统崩溃的后果
connectionManager.setMaxTotal(1000);
//每个路由的最大连接数,如果只调用一个地址,可以将其设置为最大连接数
connectionManager.setDefaultMaxPerRoute(300);
CloseableHttpClient httpClient = HttpClients.custom()
.setConnectionManager(connectionManager)
.build();
HttpComponentsClientHttpRequestFactory requestFactory =
new HttpComponentsClientHttpRequestFactory();
requestFactory.setHttpClient(httpClient);
requestFactory.setConnectionRequestTimeout(3000);
requestFactory.setConnectTimeout(3000);
requestFactory.setReadTimeout(10000);
RestTemplate restTemplate = new RestTemplate(requestFactory);
List<HttpMessageConverter<?>> list = restTemplate.getMessageConverters();
for (HttpMessageConverter<?> httpMessageConverter : list) {
if(httpMessageConverter instanceof StringHttpMessageConverter) {
((StringHttpMessageConverter) httpMessageConverter).setDefaultCharset(Charset.forName(charset));
break;
}
}
return restTemplate;
}
}
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;
/**
* @author Administrator
*/
public class HttpUtil {
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
public static String getByHttpClient(String url) {
try {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {
return null;
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
3. 使用方式
String contentHtml = HttpUtil.getByHttpClient(htmlUrl);
然后用用Java正则表达式取出需要的东西即可。
参考文章: