Ver código fonte

爬虫已就绪

詹子聪 5 anos atrás
pai
commit
a9572af62a

+ 47 - 56
src/main/java/com/miekir/shibei/bean/db/GoodsBean.java

@@ -1,6 +1,7 @@
 package com.miekir.shibei.bean.db;
 
 import javax.persistence.*;
+import java.util.List;
 
 /**
  *
@@ -22,7 +23,6 @@ public class GoodsBean {
     @Column(name = "id", nullable = false, insertable = true, updatable = false)
     public long id;
 
-
     /**
      * 封面图片地址
      */
@@ -47,94 +47,85 @@ public class GoodsBean {
      * 商品描述
      */
     @Basic
-    @Column(name = "description", nullable = true, insertable = true, updatable = true)
+    @Column(columnDefinition = "MEDIUMTEXT", name = "description", nullable = true, insertable = true, updatable = true)
     public String description;
+
     /**
-     * 商品推荐理由(一句话推荐、推荐者心声)
-     */
-    @Basic
-    @Column(name = "reason", nullable = true, insertable = true, updatable = true)
-    public String reason;
-    /**
-     * 原价
-     */
-    @Basic
-    @Column(name = "oldPrice", nullable = true, insertable = true, updatable = true)
-    public long oldPrice;
-    /**
-     * 现价
-     */
-    @Basic
-    @Column(name = "nowPrice", nullable = true, insertable = true, updatable = true)
-    public long nowPrice;
-    /**
-     * 返利
+     * 主演
      */
     @Basic
-    @Column(name = "rebate", nullable = true, insertable = true, updatable = true)
-    public long rebate;
+    @Column(name = "actors", nullable = true, insertable = true, updatable = true)
+    public String actors;
+
     /**
-     * 店名
+     * Duration as display text; the field defaults to "未知" ("unknown")
      */
     @Basic
-    @Column(name = "shopName", nullable = true, insertable = true, updatable = true)
-    public String shopName;
+    @Column(name = "timeLong", nullable = true, insertable = true, updatable = true)
+    public String timeLong = "未知";
+
     /**
-     * 商店所属省份
+     * 视频尺寸
      */
     @Basic
-    @Column(name = "province", nullable = true, insertable = true, updatable = true)
-    public String province;
+    @Column(name = "videoSize", nullable = true, insertable = true, updatable = true)
+    public String videoSize;
+
     /**
-     * 是否自营
+     * 是否有马
      */
     @Basic
-    @Column(name = "isSelfBusiness", nullable = true, insertable = true, updatable = true)
-    public boolean isSelfBusiness;
+    @Column(name = "hasMosaic", nullable = true, insertable = true, updatable = true)
+    public String hasMosaic;
+
     /**
-     * 是否有券
+     * 赞的数量
      */
     @Basic
-    @Column(name = "hasCoupon", nullable = true, insertable = true, updatable = true)
-    public boolean hasCoupon;
+    @Column(name = "dingNum", nullable = true, insertable = true, updatable = true)
+    public int dingNum;
+
     /**
-     * 优惠券信息
+     * 踩的数量
      */
     @Basic
-    @Column(name = "couponInfo", nullable = true, insertable = true, updatable = true)
-    public String couponInfo;
+    @Column(name = "caiNum", nullable = true, insertable = true, updatable = true)
+    public int caiNum;
+
     /**
-     * 所属类型
+     * 好评率 = 赞/(赞+踩)
      */
     @Basic
-    @Column(name = "goodsType", nullable = true, insertable = true, updatable = true)
-    public int goodsType;
+    @Column(name = "goodComment", nullable = true, insertable = true, updatable = true)
+    public double goodComment;
+
     /**
-     * 商品链接
+     * 烦号信息
      */
-    @Basic
-    @Column(columnDefinition = "MEDIUMTEXT", name = "goodsUrl", nullable = true, insertable = true, updatable = true)
-    public String goodsUrl;
+    @ElementCollection
+    @Column(columnDefinition = "MEDIUMTEXT", name = "fanList", nullable = true, insertable = true, updatable = true)
+    public List<String> fanList;
+
     /**
-     * 月销量
+     * 磁力链接合集
      */
-    @Basic
-    @Column(name = "salesPerMonth", nullable = true, insertable = true, updatable = true)
-    public long salesPerMonth;
+    @ElementCollection
+    @Column(columnDefinition = "MEDIUMTEXT", name = "magnetBeanList", nullable = true, insertable = true, updatable = true)
+    public List<MagnetBean> magnetBeanList;
 
     /**
-     * 评论条数
+     * 是否已收藏
      */
     @Basic
-    @Column(name = "commentNum", nullable = true, insertable = true, updatable = true)
-    public long commentNum;
+    @Column(name = "isFavorite", nullable = true, insertable = true, updatable = true)
+    public boolean isFavorite;
 
     /**
-     * 好评率
+     * 内容的图片链接列表
      */
-    @Basic
-    @Column(name = "goodCommentPercent", nullable = true, insertable = true, updatable = true)
-    public double goodCommentPercent;
+    @ElementCollection
+    @Column(columnDefinition = "MEDIUMTEXT", name = "contentImageUrlList", nullable = true, insertable = true, updatable = true)
+    public List<String> contentImageUrlList;
 
     /**
      * 创建时间

+ 15 - 0
src/main/java/com/miekir/shibei/bean/db/MagnetBean.java

@@ -0,0 +1,15 @@
+package com.miekir.shibei.bean.db;
+
+import java.io.Serializable;
+
+/**
+ * Copyright (C), 2019-2020, Miekir
+ *
+ * @author Miekir
+ * @date 2020/8/28 16:25
+ * Description: magnet link entry, i.e. a display text paired with its link target.
+ * Plain serializable holder with public fields and no behavior of its own.
+ */
+public class MagnetBean implements Serializable {
+    public String text; // display text of the link
+    public String url; // link target; NOTE(review): may be stored without the site base URL - confirm with the code that fills it
+}

+ 13 - 2
src/main/java/com/miekir/shibei/controller/task/OnStartServer.java

@@ -1,6 +1,9 @@
 package com.miekir.shibei.controller.task;
 
+import com.miekir.shibei.bean.db.GoodsBean;
+import com.miekir.shibei.repository.GoodsRepository;
 import com.miekir.shibei.tool.web.Sex8Tool;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.boot.ApplicationArguments;
 import org.springframework.boot.ApplicationRunner;
 import org.springframework.boot.CommandLineRunner;
@@ -14,11 +17,19 @@ import java.util.Arrays;
 import java.util.concurrent.Executors;
 
 /**
+ * todo 真正上线之后就不要启动这个任务了
  * spring mvc web应用启动时就执行特定处理
  *
  */
 @Component
-public class OnStartServer {
+public class CrawlerService {
+    @Autowired
+    private GoodsRepository goodsRepository;
+
+    public void saveGoods(GoodsBean goodsBean) { // hands the given bean to the injected GoodsRepository for saving
+        goodsRepository.save(goodsBean); // whatever save() returns is not used here
+    }
+
     @PostConstruct
     public void init() {
         // 调试的时候会执行两次,真正部署时其实是执行一次
@@ -29,7 +40,7 @@ public class OnStartServer {
                 // 递归会导致StackOverflowError,所以用循环代替
                 boolean shouldContinue = true;
                 while (shouldContinue) {
-                    shouldContinue = Sex8Tool.startGettingData();
+                    shouldContinue = Sex8Tool.startGettingData(CrawlerService.this);
                 }
             }
         });

+ 76 - 15
src/main/java/com/miekir/shibei/tool/web/Sex8Tool.java

@@ -1,10 +1,16 @@
 package com.miekir.shibei.tool.web;
 
+import com.miekir.shibei.bean.db.GoodsBean;
+import com.miekir.shibei.bean.db.MagnetBean;
+import com.miekir.shibei.controller.task.CrawlerService;
+import com.miekir.shibei.repository.GoodsRepository;
+import com.miekir.shibei.tool.TextUtils;
 import com.sun.org.slf4j.internal.Logger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
 
 import java.io.BufferedReader;
 import java.io.InputStream;
@@ -12,12 +18,14 @@ import java.io.InputStreamReader;
 import java.net.HttpURLConnection;
 import java.net.ProtocolException;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 import java.util.logging.Level;
 import java.util.zip.GZIPInputStream;
 
 public class Sex8Tool {
-    private static final String BASIC_URL = "http://sohumayun.space/";
+    public static final String BASIC_URL = "http://sohumayun.space/";
     private static final String PAGE_URL_FORMAT = BASIC_URL + "forum-96-%s.html";
 
     private Sex8Tool(){}
@@ -121,27 +129,31 @@ public class Sex8Tool {
      * 开始爬目录
      * @return
      */
-    public static boolean startGettingData() {
+    public static boolean startGettingData(CrawlerService service) {
         Document document;
         try {
             // 获取到当前页的内容
             String pageUrl = String.format(PAGE_URL_FORMAT, mCurrentPage);
             String content = getHtmlString(pageUrl);
+            // 如果这里解析很慢(总是走不到下一步),可能是jsoup jar包没有导入到tomcat的lib目录
             document = Jsoup.parse(content);
             if (document != null) {
                 // 解析当前页,获取每一项的详情页
-                Elements itemElementList = document.getElementsByClass("icn");
-                if (itemElementList != null && itemElementList.size() > 0) {
-                    for (Element element : itemElementList) {
-                        // 解析获取item对应的详情URL
-                        Elements tagAElementList = element.getElementsByTag("a");
-                        if (tagAElementList != null && tagAElementList.size() > 0) {
-                            String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
-                            // 开始爬详情
-                            getDetailData(detailUrl);
+                String title = getTitle(document);
+                if (!TextUtils.isEmpty(title)) {
+                    Elements itemElementList = document.getElementsByClass("icn");
+                    if (itemElementList != null && itemElementList.size() > 0) {
+                        for (Element element : itemElementList) {
+                            // 解析获取item对应的详情URL
+                            Elements tagAElementList = element.getElementsByTag("a");
+                            if (tagAElementList != null && tagAElementList.size() > 0) {
+                                String detailUrl = BASIC_URL + tagAElementList.get(0).attr("href");
+                                // 开始爬详情
+                                getDetailData(service, detailUrl, title);
+                            }
+                            // 爬完一个详情,歇一歇
+                            justWait();
                         }
-                        // 爬完一个详情,歇一歇
-                        justWait();
                     }
                 }
             }
@@ -154,11 +166,20 @@ public class Sex8Tool {
         return mCurrentPage != 0;
     }
 
+    /**
+     * Extracts a title from the parsed page: the text of the first element
+     * that carries both CSS classes {@code s} and {@code xst}.
+     *
+     * @param document parsed page to search
+     * @return the text of the first matching element, or {@code null} when
+     *         no element matches
+     */
+    private static String getTitle(Document document) {
+        // getElementsByClass() expects ONE class name; handing it "s xst" only
+        // matches when the class attribute is literally that exact string.
+        // The compound selector matches any element having both classes,
+        // regardless of their order or of additional classes.
+        Element titleElement = document.select(".s.xst").first();
+        return titleElement != null ? titleElement.text() : null;
+    }
+
     /**
      * 开始爬详情
      * @return
      */
-    public static void getDetailData(String detailUrl) {
+    public static void getDetailData(CrawlerService service, String detailUrl, String title) {
         Document document;
         try {
             // 获取到当前页的内容
@@ -169,14 +190,54 @@ public class Sex8Tool {
             }
 
             // 解析详情页
+            List<String> photoList = new ArrayList<String>();
             Elements photoElementList = document.getElementsByClass("zoom");
             if (photoElementList != null && photoElementList.size() > 0) {
                 for (Element element : photoElementList) {
                     String imageUrl = element.attr("file");
                     // 保存图片
-                    WebImageTool.autoSaveImage(imageUrl);
+                    String photoUrl = WebImageTool.autoSaveImage(imageUrl);
+                    if (!TextUtils.isEmpty(photoUrl)) {
+                        photoList.add(photoUrl);
+                    }
+                }
+            }
+
+            List<MagnetBean> magnetBeanList = new ArrayList<MagnetBean>();
+            Elements magnetElementList = document.getElementsByClass("attnm");
+            if (magnetElementList != null && magnetElementList.size() > 0) {
+                Elements magnetInfoElementList = magnetElementList.get(0).getElementsByTag("a");
+                if (magnetInfoElementList != null && magnetInfoElementList.size() > 0) {
+                    MagnetBean magnetBean = new MagnetBean();
+                    magnetBean.text = magnetInfoElementList.get(0).text();
+                    // todo 客户端查找的时候,返回去的要拼上BASIC_URL
+                    magnetBean.url = magnetInfoElementList.get(0).attr("href");
+                    magnetBeanList.add(magnetBean);
                 }
             }
+
+
+            String description = "暂无描述";
+            Elements descElementList = document.getElementsByClass("b_pr");
+            if (descElementList != null && descElementList.size() > 0) {
+                // todo 做正则剔除,或者客户端使用的时候,要使用WebView加载。先尝试WebView加载的
+                description = descElementList.get(0).parent().text();
+            }
+
+            // 保存到数据库
+            if (photoList.size() > 0 && magnetBeanList.size() > 0) {
+                GoodsBean goodsBean = new GoodsBean();
+                goodsBean.title = title;
+                goodsBean.description = description;
+                goodsBean.magnetBeanList = magnetBeanList;
+
+                goodsBean.coverImageUrl = photoList.get(0);
+                goodsBean.createTimeMillis = System.currentTimeMillis();
+                goodsBean.updateTimeMillis = System.currentTimeMillis();
+                goodsBean.enable = true;
+
+                service.saveGoods(goodsBean);
+            }
         } catch (Exception e) {
             e.printStackTrace();
         }