commit 899de00d9f
parent fba7ae528e

ApiWarnHandleConst.java
@@ -1,9 +1,9 @@
 package com.xjs.consts;
 
 /**
+ * API warning handling constants
  * @author xiejs
- * @desc API warning handling constants
- * @create 2022-01-07
+ * @since 2022-01-07
  */
 public class ApiWarnHandleConst {
     // already handled

CopyWritingConst.java
@@ -1,9 +1,9 @@
 package com.xjs.consts;
 
 /**
+ * Constants for the various data platforms
  * @author xiejs
- * @desc Constants for the various data platforms
- * @create 2021-12-28
+ * @since 2021-12-28
  */
 public class CopyWritingConst {
 

EnglishWordConst.java
@@ -1,9 +1,9 @@
 package com.xjs.consts;
 
 /**
+ * English word constants
  * @author xiejs
- * @desc English word constants
- * @create 2021-12-31
+ * @since 2021-12-31
  */
 public class EnglishWordConst {
 

RegexConst.java
@@ -27,4 +27,9 @@ public class RegexConst {
      * IPv4 / IPv6 address regex
      */
     public static final String IP_REGEX ="^((2[0-4]\\d|25[0-5]|[01]?\\d\\d?)\\.){3}(2[0-4]\\d|25[0-5]|[01]?\\d\\d?)$";
+
+    /**
+     * Numeric validation regex
+     */
+    public static final String NUMBER_REGEX= "[0-9]*";
 }

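As a usage note (not part of the commit itself): NUMBER_REGEX is the bare pattern "[0-9]*", which also matches the empty string, so callers typically pre-compile it once and guard against empty input. A minimal sketch under that assumption, relying only on the two constants shown above (the class name here is illustrative):

import java.util.regex.Pattern;

import static com.xjs.consts.RegexConst.IP_REGEX;
import static com.xjs.consts.RegexConst.NUMBER_REGEX;

// Hypothetical usage sketch of the RegexConst constants.
public class RegexConstUsageSketch {
    private static final Pattern NUMBER = Pattern.compile(NUMBER_REGEX);
    private static final Pattern IP = Pattern.compile(IP_REGEX);

    public static void main(String[] args) {
        String value = "123";
        // "[0-9]*" accepts zero or more digits, so reject empty input explicitly.
        System.out.println(!value.isEmpty() && NUMBER.matcher(value).matches()); // true
        System.out.println(IP.matcher("192.168.1.1").matches());                 // true
    }
}
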
ReptileUrlConst.java (new file)
@@ -0,0 +1,24 @@
+package com.xjs.consts;
+
+/**
+ * Crawler URL constants
+ * @author xiejs
+ * @since 2022-02-16
+ */
+public class ReptileUrlConst {
+
+    /**
+     * Sina News site
+     */
+    public static final String SINA_NEWS_URL = "https://news.sina.com.cn/";
+
+    /**
+     * Wenanwang (copywriting) site
+     */
+    public static final String COPY_WRITING_NETWORK_URL = "https://www.wenanwang.com/";
+
+
+
+
+
+}

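A short design note: pulling the crawl roots into ReptileUrlConst lets the scheduled tasks below reference one shared definition instead of repeating string literals. A minimal sketch of that usage, assuming nothing beyond the constants added above (the class name is illustrative; the real tasks fetch the pages through an injected httpUtils bean):

import static com.xjs.consts.ReptileUrlConst.COPY_WRITING_NETWORK_URL;
import static com.xjs.consts.ReptileUrlConst.SINA_NEWS_URL;

// Hypothetical caller that only prints the shared crawl roots.
public class ReptileUrlConstUsageSketch {
    public static void main(String[] args) {
        System.out.println("Sina News root: " + SINA_NEWS_URL);
        System.out.println("Wenanwang root: " + COPY_WRITING_NETWORK_URL);
    }
}
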
ReqConst.java
@@ -1,9 +1,9 @@
 package com.xjs.consts;
 
 /**
+ * Request success/failure constants
  * @author xiejs
- * @desc Request success/failure constants
- * @create 2021-12-26
+ * @since 2021-12-26
  */
 public class ReqConst {
     public static final Integer SUCCESS = 1;

CopyWritingNetworkTask.java
@@ -18,6 +18,9 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Pattern;
 
+import static com.xjs.consts.RegexConst.NUMBER_REGEX;
+import static com.xjs.consts.ReptileUrlConst.COPY_WRITING_NETWORK_URL;
+
 /**
  * Wenanwang (copywriting site) crawler task, url: https://www.wenanwang.com/
  *
@@ -34,23 +37,21 @@ public class CopyWritingNetworkTask {
     private CopyWritingNetworkService copyWritingNetworkService;
 
 
-    public static final String URL = "https://www.wenanwang.com/";
-
-    private static Pattern pattern = Pattern.compile("[0-9]*");
+    private static final Pattern pattern = Pattern.compile(NUMBER_REGEX);
 
-    @Scheduled(fixedDelay = 1000 * 5)
+    @Scheduled(fixedDelay = 1000 * 5 * 60 * 10)
     public void reptileCopyWriting() {
         try {
-            String html = httpUtils.doGetHtml(URL);
+            String html = httpUtils.doGetHtml(COPY_WRITING_NETWORK_URL);
 
             Document document = Jsoup.parse(html);
 
             this.parseHtmlGetUrl(document);
         } catch (Exception e) {
             e.printStackTrace();
-        }finally {
+        } finally {
             int i = copyWritingNetworkService.deleteRepeatData();
-            log.info("Number of duplicate Wenanwang records deleted: "+i);
+            log.info("Number of duplicate Wenanwang records deleted: " + i);
         }
     }
 

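For clarity, the schedule change above works out as follows (a reading of the literals, not code from the commit): the old fixedDelay of 1000 * 5 = 5,000 ms re-ran the crawl 5 seconds after each completion, while the new value waits roughly 50 minutes.

// Worked arithmetic for the new @Scheduled delay (illustrative only).
long fixedDelayMs = 1000L * 5 * 60 * 10;      // 3,000,000 ms
long delayMinutes = fixedDelayMs / 1000 / 60; // = 50 minutes between completions
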
@@ -92,7 +93,7 @@ public class CopyWritingNetworkTask {
         for (Element element : a) {
 
             String href = element.attr("href");
-            String newUrl = URL + href;
+            String newUrl = COPY_WRITING_NETWORK_URL + href;
 
             String cw = httpUtils.doGetHtml(newUrl);
             Document cwDocument = Jsoup.parse(cw);

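A side note on the link-joining step above: concatenating the base constant with each href assumes the scraped links are site-relative paths. If absolute links can appear, Jsoup can resolve them instead. A standalone sketch under that assumption, using Jsoup's own connect() rather than the project's httpUtils (class and variable names here are illustrative):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import static com.xjs.consts.ReptileUrlConst.COPY_WRITING_NETWORK_URL;

// Hypothetical sketch: visit every link on the index page, resolving relative hrefs safely.
public class LinkFollowSketch {
    public static void main(String[] args) throws Exception {
        Document index = Jsoup.connect(COPY_WRITING_NETWORK_URL).get();
        for (Element a : index.select("a[href]")) {
            // "abs:href" resolves the href against the page's base URL,
            // so both relative and absolute links come back as full URLs.
            String newUrl = a.attr("abs:href");
            System.out.println(newUrl);
        }
    }
}
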
SinaNewsTask.java
@@ -16,6 +16,8 @@ import org.springframework.stereotype.Component;
 import java.util.*;
 import java.util.stream.Collectors;
 
+import static com.xjs.consts.ReptileUrlConst.SINA_NEWS_URL;
+
 /**
  * Sina News crawler task
  * @author xiejs
@@ -30,11 +32,12 @@ public class SinaNewsTask {
     @Autowired
     private SinaNewsService sinaNewsService;
 
 
+
     public void reptileSinaNews() {
         try {
-            String url = "https://news.sina.com.cn/";
 
-            String html = httpUtils.doGetHtml(url);
+            String html = httpUtils.doGetHtml(SINA_NEWS_URL);
 
             Document document = Jsoup.parse(html);
+