|
|
@@ -24,7 +24,8 @@ public class Sex8Tool {
|
|
|
public static final String BASIC_URL = "http://sohumayun.space/";
|
|
|
private static final String PAGE_URL_FORMAT = BASIC_URL + "forum-96-%s.html";
|
|
|
|
|
|
- private Sex8Tool(){}
|
|
|
+ private Sex8Tool() {
|
|
|
+ }
|
|
|
|
|
|
private static boolean mIsLoading;
|
|
|
// 从第二页开始爬
|
|
|
@@ -32,6 +33,7 @@ public class Sex8Tool {
|
|
|
|
|
|
/**
|
|
|
* 为了防止被百度拦截(人机识别),需要按F12查看浏览器的Header,设置相关属性
|
|
|
+ *
|
|
|
* @param connection
|
|
|
* @return
|
|
|
* @throws ProtocolException
|
|
|
@@ -63,6 +65,7 @@ public class Sex8Tool {
|
|
|
|
|
|
/**
|
|
|
* 获取网页源码
|
|
|
+ *
|
|
|
* @return
|
|
|
*/
|
|
|
private static String getHtmlString(String targetUrl) {
|
|
|
@@ -89,7 +92,7 @@ public class Sex8Tool {
|
|
|
}
|
|
|
|
|
|
return getHtmlString(url302);
|
|
|
- } else if (200 == connection.getResponseCode()) {
|
|
|
+ } else if (200 == connection.getResponseCode()) {
|
|
|
InputStream inputStream = null;
|
|
|
if (connection.getContentEncoding() != null && !connection.getContentEncoding().equals("")) {
|
|
|
String encode = connection.getContentEncoding().toLowerCase();
|
|
|
@@ -123,6 +126,7 @@ public class Sex8Tool {
|
|
|
|
|
|
/**
|
|
|
* 开始爬目录
|
|
|
+ *
|
|
|
* @return
|
|
|
*/
|
|
|
public static boolean startGettingData(CrawlerService service) {
|
|
|
@@ -135,36 +139,24 @@ public class Sex8Tool {
|
|
|
document = Jsoup.parse(content);
|
|
|
if (document != null) {
|
|
|
// 解析当前页,获取每一项的详情页
|
|
|
- List<String> titleList = new ArrayList<String>();
|
|
|
- getTitles(document, titleList);
|
|
|
- if (titleList.size() > 0) {
|
|
|
- Elements itemElementList = document.getElementsByClass("icn");
|
|
|
- if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
- int index = 0;
|
|
|
- for (Element element : itemElementList) {
|
|
|
- // 解析获取item对应的详情URL
|
|
|
- Elements tagAElementList = element.getElementsByTag("a");
|
|
|
- if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
|
- String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
|
- // 开始爬详情
|
|
|
- if (index < titleList.size()) {
|
|
|
- getDetailData(service, detailUrl, titleList.get(index));
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 爬完一个详情,歇一歇
|
|
|
- justWait();
|
|
|
- index++;
|
|
|
+ Elements itemElementList = document.getElementsByClass("icn");
|
|
|
+ if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
+ int index = 0;
|
|
|
+ for (Element element : itemElementList) {
|
|
|
+ // 解析获取item对应的详情URL
|
|
|
+ Elements tagAElementList = element.getElementsByTag("a");
|
|
|
+ if (tagAElementList != null && tagAElementList.size() > 0) {
|
|
|
+ String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
|
|
|
+ // 开始爬详情
|
|
|
+ getDetailData(service, detailUrl);
|
|
|
}
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
|
|
|
+ // 爬完一个详情,歇一歇
|
|
|
+ justWait();
|
|
|
+ index++;
|
|
|
+ }
|
|
|
}
|
|
|
+ }
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
@@ -175,21 +167,28 @@ public class Sex8Tool {
|
|
|
return mCurrentPage != 0;
|
|
|
}
|
|
|
|
|
|
- private static void getTitles(Element docElement, List<String> titleList) {
|
|
|
- Elements itemElementList = docElement.getElementsByClass("s xst");
|
|
|
+ private static String getTitles(Element docElement) {
|
|
|
+ Elements itemElementList = docElement.getElementsByTag("title");
|
|
|
if (itemElementList != null && itemElementList.size() > 0) {
|
|
|
- for (Element element : itemElementList) {
|
|
|
- titleList.add(element.text());
|
|
|
+ String titleExtra = itemElementList.get(0).text();
|
|
|
+ if (!TextUtils.isEmpty(titleExtra) && titleExtra.contains("杏吧")) {
|
|
|
+ int index = titleExtra.indexOf("杏吧");
|
|
|
+ if (index > 1) {
|
|
|
+ index = index - 1;
|
|
|
+ }
|
|
|
+ return titleExtra.substring(0, index);
|
|
|
}
|
|
|
-
|
|
|
}
|
|
|
+
|
|
|
+ return null;
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
* 开始爬详情
|
|
|
+ *
|
|
|
* @return
|
|
|
*/
|
|
|
- public static void getDetailData(CrawlerService service, String detailUrl, String title) {
|
|
|
+ public static void getDetailData(CrawlerService service, String detailUrl) {
|
|
|
Document document;
|
|
|
try {
|
|
|
// 获取到当前页的内容
|
|
|
@@ -238,7 +237,7 @@ public class Sex8Tool {
|
|
|
|
|
|
// 保存到数据库
|
|
|
if (photoList.size() > 0 && magnetBeanList.size() > 0) {
|
|
|
- goodsBean.title = title;
|
|
|
+ goodsBean.title = getTitles(document);
|
|
|
goodsBean.description = description;
|
|
|
goodsBean.magnetBeanList.addAll(magnetBeanList);
|
|
|
goodsBean.contentImageUrlList.addAll(photoList);
|