|
|
@@ -1,10 +1,16 @@
|
|
|
package com.miekir.shibei.tool.web;
|
|
|
|
|
|
+import com.miekir.shibei.bean.db.GoodsBean;
|
|
|
+import com.miekir.shibei.bean.db.MagnetBean;
|
|
|
+import com.miekir.shibei.controller.task.CrawlerService;
|
|
|
+import com.miekir.shibei.repository.GoodsRepository;
|
|
|
+import com.miekir.shibei.tool.TextUtils;
|
|
|
import com.sun.org.slf4j.internal.Logger;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
|
import org.jsoup.nodes.Element;
|
|
|
import org.jsoup.select.Elements;
|
|
|
+import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
|
import java.io.InputStream;
|
|
|
@@ -12,12 +18,14 @@ import java.io.InputStreamReader;
|
|
|
import java.net.HttpURLConnection;
|
|
|
import java.net.ProtocolException;
|
|
|
import java.net.URL;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
import java.util.Random;
|
|
|
import java.util.logging.Level;
|
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
|
|
public class Sex8Tool {
|
|
|
- private static final String BASIC_URL = "http://sohumayun.space/";
|
|
|
+ public static final String BASIC_URL = "http://sohumayun.space/";
|
|
|
private static final String PAGE_URL_FORMAT = BASIC_URL + "forum-96-%s.html";
|
|
|
|
|
|
/**
 * Private no-op constructor; keeps code outside this class from creating instances.
 */
private Sex8Tool() {
    // intentionally empty
}
|
|
|
@@ -121,27 +129,31 @@ public class Sex8Tool {
|
|
|
* 开始爬目录
|
|
|
* @return
|
|
|
*/
|
|
|
- public static boolean startGettingData() {
|
|
|
+ public static boolean startGettingData(CrawlerService service) {
|
|
|
Document document;
|
|
|
try {
|
|
|
// 获取到当前页的内容
|
|
|
String pageUrl = String.format(PAGE_URL_FORMAT, mCurrentPage);
|
|
|
String content = getHtmlString(pageUrl);
|
|
|
+ // 如果这里解析很慢(总是走不到下一步),可能是jsoup jar包没有导入到tomcat的lib目录
|
|
|
document = Jsoup.parse(content);
|
|
|
if (document != null) {
|
|
|
// 解析当前页,获取每一项的详情页
|
|
|
- Elements itemElementList = document.getElementsByClass("icn");
|
|
|
- if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
- for (Element element : itemElementList) {
|
|
|
- // 解析获取item对应的详情URL
|
|
|
- Elements tagAElementList = element.getElementsByTag("a");
|
|
|
- if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
|
- String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
|
- // 开始爬详情
|
|
|
- getDetailData(detailUrl);
|
|
|
+ String title = getTitle(document);
|
|
|
+ if (!TextUtils.isEmpty(title)) {
|
|
|
+ Elements itemElementList = document.getElementsByClass("icn");
|
|
|
+ if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
+ for (Element element : itemElementList) {
|
|
|
+ // 解析获取item对应的详情URL
|
|
|
+ Elements tagAElementList = element.getElementsByTag("a");
|
|
|
+ if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
|
+ String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
|
+ // 开始爬详情
|
|
|
+ getDetailData(service, detailUrl, title);
|
|
|
+ }
|
|
|
+ // 爬完一个详情,歇一歇
|
|
|
+ justWait();
|
|
|
}
|
|
|
- // 爬完一个详情,歇一歇
|
|
|
- justWait();
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -154,11 +166,20 @@ public class Sex8Tool {
|
|
|
return mCurrentPage != 0;
|
|
|
}
|
|
|
|
|
|
+ private static String getTitle(Document document) {
|
|
|
+ Elements itemElementList = document.getElementsByClass("s xst");
|
|
|
+ if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
+ return itemElementList.get(0).text();
|
|
|
+ }
|
|
|
+
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 开始爬详情
|
|
|
* @return
|
|
|
*/
|
|
|
- public static void getDetailData(String detailUrl) {
|
|
|
+ public static void getDetailData(CrawlerService service, String detailUrl, String title) {
|
|
|
Document document;
|
|
|
try {
|
|
|
// 获取到当前页的内容
|
|
|
@@ -169,14 +190,54 @@ public class Sex8Tool {
|
|
|
}
|
|
|
|
|
|
// 解析详情页
|
|
|
+ List<String> photoList = new ArrayList<String>();
|
|
|
Elements photoElementList = document.getElementsByClass("zoom");
|
|
|
if (photoElementList != null && photoElementList.size() > 0) {
|
|
|
for (Element element : photoElementList) {
|
|
|
String imageUrl = element.attr("file");
|
|
|
// 保存图片
|
|
|
- WebImageTool.autoSaveImage(imageUrl);
|
|
|
+ String photoUrl = WebImageTool.autoSaveImage(imageUrl);
|
|
|
+ if (!TextUtils.isEmpty(photoUrl)) {
|
|
|
+ photoList.add(photoUrl);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ List<MagnetBean> magnetBeanList = new ArrayList<MagnetBean>();
|
|
|
+ Elements magnetElementList = document.getElementsByClass("attnm");
|
|
|
+ if (magnetElementList != null && magnetElementList.size() > 0) {
|
|
|
+ Elements magnetInfoElementList = magnetElementList.get(0).getElementsByTag("a");
|
|
|
+ if (magnetInfoElementList != null && magnetInfoElementList.size() > 0) {
|
|
|
+ MagnetBean magnetBean = new MagnetBean();
|
|
|
+ magnetBean.text = magnetInfoElementList.get(0).text();
|
|
|
+ // todo 客户端查找的时候,返回去的要拼上BASIC_URL
|
|
|
+ magnetBean.url = magnetInfoElementList.get(0).attr("href");
|
|
|
+ magnetBeanList.add(magnetBean);
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+
|
|
|
+ String description = "暂无描述";
|
|
|
+ Elements descElementList = document.getElementsByClass("b_pr");
|
|
|
+ if (descElementList != null && descElementList.size() > 0) {
|
|
|
+ // todo 做正则剔除,或者客户端使用的时候,要使用WebView加载。先尝试WebView加载的
|
|
|
+ description = descElementList.get(0).parent().text();
|
|
|
+ }
|
|
|
+
|
|
|
+ // 保存到数据库
|
|
|
+ if (photoList.size() > 0 && magnetBeanList.size() > 0) {
|
|
|
+ GoodsBean goodsBean = new GoodsBean();
|
|
|
+ goodsBean.title = title;
|
|
|
+ goodsBean.description = description;
|
|
|
+ goodsBean.magnetBeanList = magnetBeanList;
|
|
|
+
|
|
|
+ goodsBean.coverImageUrl = photoList.get(0);
|
|
|
+ goodsBean.createTimeMillis = System.currentTimeMillis();
|
|
|
+ goodsBean.updateTimeMillis = System.currentTimeMillis();
|
|
|
+ goodsBean.enable = true;
|
|
|
+
|
|
|
+ service.saveGoods(goodsBean);
|
|
|
+ }
|
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
}
|