CrawlScript · bhavya844 · Mar 31, 2024
diff --git a/pom.xml b/pom.xml
@@ -12,8 +12,8 @@
     <packaging>jar</packaging>
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <maven.compiler.source>1.6</maven.compiler.source>
-        <maven.compiler.target>1.6</maven.compiler.target>
+        <maven.compiler.source>21</maven.compiler.source>
+        <maven.compiler.target>21</maven.compiler.target>
     </properties>
 
     <licenses>

diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
@@ -34,6 +34,8 @@
 import org.jsoup.select.NodeVisitor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import static cn.edu.hfut.dmic.contentextractor.NewsExtractor.getNewsByHtml;
+
 
 /**
  * ContentExtractor could extract content,title,time from news webpage
@@ -50,6 +52,8 @@ public class ContentExtractor {
         this.doc = doc;
     }
 
+    TextAnalyzer textAnalyzer=new TextAnalyzer();
+
     protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();
 
     class CountInfo {
@@ -122,7 +126,11 @@ protected CountInfo computeInfo(Node node) {
     protected double computeScore(Element tag) {
         CountInfo countInfo = infoMap.get(tag);
         double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
-        double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
+        // Score is the product of four factors: ln(var), densitySum, ln(textCount - linkTextCount + 1) and log10(pCount + 2).
+        double logVar = Math.log(var);
+        double adjustedTextCount = Math.log(countInfo.textCount - countInfo.linkTextCount + 1);
+        double adjustedPCount = Math.log10(countInfo.pCount + 2);
+        double score = logVar * countInfo.densitySum * adjustedTextCount * adjustedPCount;
         return score;
     }
 
@@ -168,38 +176,16 @@ public Element getContentElement() throws Exception {
         return content;
     }
 
-    public News getNews() throws Exception {
-        News news = new News();
-        Element contentElement;
-        try {
-            contentElement = getContentElement();
-            news.setContentElement(contentElement);
-        } catch (Exception ex) {
-            LOG.info("news content extraction failed,extraction abort", ex);
-            throw new Exception(ex);
-        }
-
-        if (doc.baseUri() != null) {
-            news.setUrl(doc.baseUri());
-        }
-
-        try {
-            news.setTime(getTime(contentElement));
-        } catch (Exception ex) {
-            LOG.info("news title extraction failed", ex);
-        }
-
-        try {
-            news.setTitle(getTitle(contentElement));
-        } catch (Exception ex) {
-            LOG.info("title extraction failed", ex);
+    protected String getTime(Element contentElement) throws Exception {
+        Element searchedElement = climbDOMTree(contentElement);
+        String time = findTimeInElement(searchedElement);
+        if (time != null) {
+            return time;
         }
-        return news;
+        return handleTimeExtractionFailure(contentElement);
     }
 
-    protected String getTime(Element contentElement) throws Exception {
-        String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
-        Pattern pattern = Pattern.compile(regex);
+    private Element climbDOMTree(Element contentElement) {
         Element current = contentElement;
         for (int i = 0; i < 2; i++) {
             if (current != null && current != doc.body()) {
@@ -209,26 +195,34 @@ protected String getTime(Element contentElement) throws Exception {
                 }
             }
         }
+        return current;
+    }
+
+    private String findTimeInElement(Element element) {
+        String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
+        Pattern pattern = Pattern.compile(regex);
         for (int i = 0; i < 6; i++) {
-            if (current == null) {
+            if (element == null) {
                 break;
             }
-            String currentHtml = current.outerHtml();
+            String currentHtml = element.outerHtml();
             Matcher matcher = pattern.matcher(currentHtml);
             if (matcher.find()) {
                 return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
             }
-            if (current != doc.body()) {
-                current = current.parent();
+            if (element != doc.body()) {
+                element = element.parent();
             }
         }
+        return null;
+    }
 
+    private String handleTimeExtractionFailure(Element contentElement) throws Exception {
         try {
             return getDate(contentElement);
         } catch (Exception ex) {
-            throw new Exception("time not found");
+            throw new Exception("Time not found");
         }
-
     }
 
     protected String getDate(Element contentElement) throws Exception {
@@ -259,24 +253,6 @@ protected String getDate(Element contentElement) throws Exception {
         throw new Exception("date not found");
     }
 
-    protected double strSim(String a, String b) {
-        int len1 = a.length();
-        int len2 = b.length();
-        if (len1 == 0 || len2 == 0) {
-            return 0;
-        }
-        double ratio;
-        if (len1 > len2) {
-            ratio = (len1 + 0.0) / len2;
-        } else {
-            ratio = (len2 + 0.0) / len1;
-        }
-        if (ratio >= 3) {
-            return 0;
-        }
-        return (lcs(a, b) + 0.0) / Math.max(len1, len2);
-    }
-
     protected String getTitle(final Element contentElement) throws Exception {
         final ArrayList<Element> titleList = new ArrayList<Element>();
         final ArrayList<Double> titleSim = new ArrayList<Double>();
@@ -295,7 +271,7 @@ public void head(Node node, int i) {
                         String tagName = tag.tagName();
                         if (Pattern.matches("h[1-6]", tagName)) {
                             String title = tag.text().trim();
-                            double sim = strSim(title, metaTitle);
+                            double sim = textAnalyzer.strSim(title, metaTitle);
                             titleSim.add(sim);
                             titleList.add(tag);
                         }
@@ -351,7 +327,7 @@ public void head(Node node, int i) {
                 if (node instanceof TextNode) {
                     TextNode tn = (TextNode) node;
                     String text = tn.text().trim();
-                    double sim = strSim(text, metaTitle);
+                    double sim = textAnalyzer.strSim(text, metaTitle);
                     if (sim > 0) {
                         if (sim > max.get(0)) {
                             max.set(0, sim);
@@ -396,42 +372,6 @@ protected int lcs(String x, String y) {
 
     }
 
-    protected int editDistance(String word1, String word2) {
-        int len1 = word1.length();
-        int len2 = word2.length();
-
-        int[][] dp = new int[len1 + 1][len2 + 1];
-
-        for (int i = 0; i <= len1; i++) {
-            dp[i][0] = i;
-        }
-
-        for (int j = 0; j <= len2; j++) {
-            dp[0][j] = j;
-        }
-
-        for (int i = 0; i < len1; i++) {
-            char c1 = word1.charAt(i);
-            for (int j = 0; j < len2; j++) {
-                char c2 = word2.charAt(j);
-
-                if (c1 == c2) {
-                    dp[i + 1][j + 1] = dp[i][j];
-                } else {
-                    int replace = dp[i][j] + 1;
-                    int insert = dp[i][j + 1] + 1;
-                    int delete = dp[i + 1][j] + 1;
-
-                    int min = replace > insert ? insert : replace;
-                    min = delete > min ? min : delete;
-                    dp[i + 1][j + 1] = min;
-                }
-            }
-        }
-
-        return dp[len1][len2];
-    }
-
     /*输入Jsoup的Document，获取正文所在Element*/
     public static Element getContentElementByDoc(Document doc) throws Exception {
         ContentExtractor ce = new ContentExtractor(doc);
@@ -485,24 +425,6 @@ public static String getContentByUrl(String url) throws Exception {
         return getContentByHtml(html, url);
     }
 
-    /*输入Jsoup的Document，获取结构化新闻信息*/
-    public static News getNewsByDoc(Document doc) throws Exception {
-        ContentExtractor ce = new ContentExtractor(doc);
-        return ce.getNews();
-    }
-
-    /*输入HTML，获取结构化新闻信息*/
-    public static News getNewsByHtml(String html) throws Exception {
-        Document doc = Jsoup.parse(html);
-        return getNewsByDoc(doc);
-    }
-
-    /*输入HTML和URL，获取结构化新闻信息*/
-    public static News getNewsByHtml(String html, String url) throws Exception {
-        Document doc = Jsoup.parse(html, url);
-        return getNewsByDoc(doc);
-    }
-
     /*输入URL，获取结构化新闻信息*/
     public static News getNewsByUrl(String url) throws Exception {
 //        HttpRequest request = new HttpRequest(url);

diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java
@@ -0,0 +1,88 @@
+package cn.edu.hfut.dmic.contentextractor;
+
+import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Builds a structured {@link News} object (content element, URL, time, title)
+ * from a parsed page by delegating the extraction steps to {@link ContentExtractor}.
+ */
+public class NewsExtractor {
+    // Not read or written by this class; kept so the class's fields are unchanged.
+    News news=new News();
+
+    public static OkHttpRequester okHttpRequester = new OkHttpRequester();
+    private static final Logger LOG = LoggerFactory.getLogger(NewsExtractor.class);
+
+    private final Document doc;
+    private final ContentExtractor contentExtractor;
+
+    /**
+     * @param doc parsed page to extract from; also the source of the news URL
+     */
+    public NewsExtractor(Document doc) {
+        // getNews() reads doc.baseUri(), so the document must be stored here;
+        // with the field left unassigned, getNews() hits a NullPointerException
+        // at doc.baseUri() as soon as content extraction succeeds.
+        this.doc = doc;
+        this.contentExtractor = new ContentExtractor(doc);
+    }
+
+    /**
+     * Extracts the news content element, URL, time and title.
+     *
+     * <p>Content extraction is mandatory: if it fails, the failure is logged,
+     * wrapped in a new exception and thrown. Time and title are best-effort: a
+     * failure is logged and the corresponding field is simply not set.
+     *
+     * @return the populated news object
+     * @throws Exception if the content element cannot be extracted
+     */
+    public News getNews() throws Exception {
+        News result = new News();
+        Element contentElement;
+        try {
+            contentElement = contentExtractor.getContentElement();
+            result.setContentElement(contentElement);
+        } catch (Exception ex) {
+            LOG.info("news content extraction failed,extraction abort", ex);
+            throw new Exception(ex);
+        }
+
+        if (doc.baseUri() != null) {
+            result.setUrl(doc.baseUri());
+        }
+
+        try {
+            result.setTime(contentExtractor.getTime(contentElement));
+        } catch (Exception ex) {
+            LOG.info("news time extraction failed", ex);
+        }
+
+        try {
+            result.setTitle(contentExtractor.getTitle(contentElement));
+        } catch (Exception ex) {
+            LOG.info("title extraction failed", ex);
+        }
+        return result;
+    }
+
+    /** Extracts structured news information from a Jsoup {@link Document}. */
+    public static News getNewsByDoc(Document doc) throws Exception {
+        return new NewsExtractor(doc).getNews();
+    }
+
+    /** Parses the given HTML and extracts structured news information from it. */
+    public static News getNewsByHtml(String html) throws Exception {
+        return getNewsByDoc(Jsoup.parse(html));
+    }
+
+    /** Parses the given HTML with {@code url} as its base URI and extracts structured news information. */
+    public static News getNewsByHtml(String html, String url) throws Exception {
+        return getNewsByDoc(Jsoup.parse(html, url));
+    }
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java
@@ -0,0 +1,72 @@
+package cn.edu.hfut.dmic.contentextractor;
+
+/**
+ * String-comparison helpers: a length-normalised similarity score, the longest
+ * common subsequence length and the edit distance between two strings.
+ */
+public class TextAnalyzer {
+
+    /**
+     * Scores how similar two strings are as LCS length divided by the longer length.
+     *
+     * @return 0 when either string is empty or one is at least three times as
+     *         long as the other; otherwise {@code lcs(a, b) / max(length)}
+     */
+    protected double strSim(String a, String b) {
+        int longer = Math.max(a.length(), b.length());
+        int shorter = Math.min(a.length(), b.length());
+        if (shorter == 0) {
+            return 0;
+        }
+        // Strings whose lengths differ by a factor of three or more are treated as unrelated.
+        if ((double) longer / shorter >= 3) {
+            return 0;
+        }
+        return (double) lcs(a, b) / longer;
+    }
+
+    /** Returns the length of the longest common subsequence of {@code x} and {@code y}. */
+    protected int lcs(String x, String y) {
+        int rows = x.length();
+        int cols = y.length();
+        if (rows == 0 || cols == 0) {
+            return 0;
+        }
+        // table[i][j] holds the LCS length of the first i chars of x and the first j chars of y.
+        int[][] table = new int[rows + 1][cols + 1];
+        for (int i = 1; i <= rows; i++) {
+            char cx = x.charAt(i - 1);
+            for (int j = 1; j <= cols; j++) {
+                table[i][j] = cx == y.charAt(j - 1)
+                        ? table[i - 1][j - 1] + 1
+                        : Math.max(table[i - 1][j], table[i][j - 1]);
+            }
+        }
+        return table[rows][cols];
+    }
+
+    /** Returns the edit distance between the two words; each insertion, deletion or substitution costs 1. */
+    protected int editDistance(String word1, String word2) {
+        int n = word1.length();
+        int m = word2.length();
+        // cost[i][j] is the distance between the first i chars of word1 and the first j chars of word2.
+        int[][] cost = new int[n + 1][m + 1];
+        for (int i = 0; i <= n; i++) {
+            cost[i][0] = i;
+        }
+        for (int j = 0; j <= m; j++) {
+            cost[0][j] = j;
+        }
+        for (int i = 1; i <= n; i++) {
+            for (int j = 1; j <= m; j++) {
+                if (word1.charAt(i - 1) == word2.charAt(j - 1)) {
+                    cost[i][j] = cost[i - 1][j - 1];
+                    continue;
+                }
+                int cheapestNeighbour = Math.min(cost[i - 1][j - 1], Math.min(cost[i - 1][j], cost[i][j - 1]));
+                cost[i][j] = cheapestNeighbour + 1;
+            }
+        }
+        return cost[n][m];
+    }
+}