|
@@ -3,14 +3,11 @@ package com.miekir.shibei.tool.web;
|
|
|
import com.miekir.shibei.bean.db.GoodsBean;
|
|
import com.miekir.shibei.bean.db.GoodsBean;
|
|
|
import com.miekir.shibei.bean.db.MagnetBean;
|
|
import com.miekir.shibei.bean.db.MagnetBean;
|
|
|
import com.miekir.shibei.controller.task.CrawlerService;
|
|
import com.miekir.shibei.controller.task.CrawlerService;
|
|
|
-import com.miekir.shibei.repository.GoodsRepository;
|
|
|
|
|
import com.miekir.shibei.tool.TextUtils;
|
|
import com.miekir.shibei.tool.TextUtils;
|
|
|
-import com.sun.org.slf4j.internal.Logger;
|
|
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Document;
|
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.nodes.Element;
|
|
|
import org.jsoup.select.Elements;
|
|
import org.jsoup.select.Elements;
|
|
|
-import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.BufferedReader;
|
|
|
import java.io.InputStream;
|
|
import java.io.InputStream;
|
|
@@ -21,7 +18,6 @@ import java.net.URL;
|
|
|
import java.util.ArrayList;
|
|
import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
|
import java.util.Random;
|
|
import java.util.Random;
|
|
|
-import java.util.logging.Level;
|
|
|
|
|
import java.util.zip.GZIPInputStream;
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
|
|
|
|
public class Sex8Tool {
|
|
public class Sex8Tool {
|
|
@@ -139,24 +135,37 @@ public class Sex8Tool {
|
|
|
document = Jsoup.parse(content);
|
|
document = Jsoup.parse(content);
|
|
|
if (document != null) {
|
|
if (document != null) {
|
|
|
// 解析当前页,获取每一项的详情页
|
|
// 解析当前页,获取每一项的详情页
|
|
|
- String title = getTitle(document);
|
|
|
|
|
- if (!TextUtils.isEmpty(title)) {
|
|
|
|
|
|
|
+ List<String> titleList = new ArrayList<String>();
|
|
|
|
|
+ getTitles(document, titleList);
|
|
|
|
|
+ if (titleList.size() > 0) {
|
|
|
Elements itemElementList = document.getElementsByClass("icn");
|
|
Elements itemElementList = document.getElementsByClass("icn");
|
|
|
if (itemElementList != null && itemElementList.size() > 0) {
|
|
if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
|
|
+ int index = 0;
|
|
|
for (Element element : itemElementList) {
|
|
for (Element element : itemElementList) {
|
|
|
// 解析获取item对应的详情URL
|
|
// 解析获取item对应的详情URL
|
|
|
Elements tagAElementList = element.getElementsByTag("a");
|
|
Elements tagAElementList = element.getElementsByTag("a");
|
|
|
if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
|
String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
|
// 开始爬详情
|
|
// 开始爬详情
|
|
|
- getDetailData(service, detailUrl, title);
|
|
|
|
|
|
|
+ if (index < titleList.size()) {
|
|
|
|
|
+ getDetailData(service, detailUrl, titleList.get(index));
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
// 爬完一个详情,歇一歇
|
|
// 爬完一个详情,歇一歇
|
|
|
justWait();
|
|
justWait();
|
|
|
|
|
+ index++;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
- }
|
|
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
e.printStackTrace();
|
|
|
}
|
|
}
|
|
@@ -166,13 +175,14 @@ public class Sex8Tool {
|
|
|
return mCurrentPage != 0;
|
|
return mCurrentPage != 0;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- private static String getTitle(Document document) {
|
|
|
|
|
- Elements itemElementList = document.getElementsByClass("s xst");
|
|
|
|
|
|
|
+ private static void getTitles(Element docElement, List<String> titleList) {
|
|
|
|
|
+ Elements itemElementList = docElement.getElementsByClass("s xst");
|
|
|
if (itemElementList != null && itemElementList.size() > 0) {
|
|
if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
- return itemElementList.get(0).text();
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ for (Element element : itemElementList) {
|
|
|
|
|
+ titleList.add(element.text());
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- return null;
|
|
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -203,6 +213,7 @@ public class Sex8Tool {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ GoodsBean goodsBean = new GoodsBean();
|
|
|
List<MagnetBean> magnetBeanList = new ArrayList<MagnetBean>();
|
|
List<MagnetBean> magnetBeanList = new ArrayList<MagnetBean>();
|
|
|
Elements magnetElementList = document.getElementsByClass("attnm");
|
|
Elements magnetElementList = document.getElementsByClass("attnm");
|
|
|
if (magnetElementList != null && magnetElementList.size() > 0) {
|
|
if (magnetElementList != null && magnetElementList.size() > 0) {
|
|
@@ -212,6 +223,7 @@ public class Sex8Tool {
|
|
|
magnetBean.text = magnetInfoElementList.get(0).text();
|
|
magnetBean.text = magnetInfoElementList.get(0).text();
|
|
|
// todo 客户端查找的时候,返回去的要拼上BASIC_URL
|
|
// todo 客户端查找的时候,返回去的要拼上BASIC_URL
|
|
|
magnetBean.url = magnetInfoElementList.get(0).attr("href");
|
|
magnetBean.url = magnetInfoElementList.get(0).attr("href");
|
|
|
|
|
+ magnetBean.goodsBean = goodsBean;
|
|
|
magnetBeanList.add(magnetBean);
|
|
magnetBeanList.add(magnetBean);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -226,7 +238,6 @@ public class Sex8Tool {
|
|
|
|
|
|
|
|
// 保存到数据库
|
|
// 保存到数据库
|
|
|
if (photoList.size() > 0 && magnetBeanList.size() > 0) {
|
|
if (photoList.size() > 0 && magnetBeanList.size() > 0) {
|
|
|
- GoodsBean goodsBean = new GoodsBean();
|
|
|
|
|
goodsBean.title = title;
|
|
goodsBean.title = title;
|
|
|
goodsBean.description = description;
|
|
goodsBean.description = description;
|
|
|
goodsBean.magnetBeanList.addAll(magnetBeanList);
|
|
goodsBean.magnetBeanList.addAll(magnetBeanList);
|