diff --git a/pom.xml b/pom.xml
index 1e16a891..d48ec2e2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,8 +12,8 @@
jar
UTF-8
- 1.6
- 1.6
+ 21
+ 21
diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
index 770d07f9..00a86aee 100644
--- a/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
+++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
@@ -34,6 +34,8 @@
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static cn.edu.hfut.dmic.contentextractor.NewsExtractor.getNewsByHtml;
+
/**
* ContentExtractor could extract content,title,time from news webpage
@@ -50,6 +52,8 @@ public class ContentExtractor {
this.doc = doc;
}
+    private final TextAnalyzer textAnalyzer = new TextAnalyzer();
+
protected HashMap infoMap = new HashMap();
class CountInfo {
@@ -122,7 +126,11 @@ protected CountInfo computeInfo(Node node) {
protected double computeScore(Element tag) {
CountInfo countInfo = infoMap.get(tag);
double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
- double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
+// double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
+ double logVar = Math.log(var);
+ double adjustedTextCount = Math.log(countInfo.textCount - countInfo.linkTextCount + 1);
+ double adjustedPCount = Math.log10(countInfo.pCount + 2);
+ double score = logVar * countInfo.densitySum * adjustedTextCount * adjustedPCount;
return score;
}
@@ -168,38 +176,16 @@ public Element getContentElement() throws Exception {
return content;
}
- public News getNews() throws Exception {
- News news = new News();
- Element contentElement;
- try {
- contentElement = getContentElement();
- news.setContentElement(contentElement);
- } catch (Exception ex) {
- LOG.info("news content extraction failed,extraction abort", ex);
- throw new Exception(ex);
- }
-
- if (doc.baseUri() != null) {
- news.setUrl(doc.baseUri());
- }
-
- try {
- news.setTime(getTime(contentElement));
- } catch (Exception ex) {
- LOG.info("news title extraction failed", ex);
- }
-
- try {
- news.setTitle(getTitle(contentElement));
- } catch (Exception ex) {
- LOG.info("title extraction failed", ex);
+ protected String getTime(Element contentElement) throws Exception {
+ Element searchedElement = climbDOMTree(contentElement);
+ String time = findTimeInElement(searchedElement);
+ if (time != null) {
+ return time;
}
- return news;
+ return handleTimeExtractionFailure(contentElement);
}
- protected String getTime(Element contentElement) throws Exception {
- String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
- Pattern pattern = Pattern.compile(regex);
+ private Element climbDOMTree(Element contentElement) {
Element current = contentElement;
for (int i = 0; i < 2; i++) {
if (current != null && current != doc.body()) {
@@ -209,26 +195,34 @@ protected String getTime(Element contentElement) throws Exception {
}
}
}
+ return current;
+ }
+
+ private String findTimeInElement(Element element) {
+ String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
+ Pattern pattern = Pattern.compile(regex);
for (int i = 0; i < 6; i++) {
- if (current == null) {
+ if (element == null) {
break;
}
- String currentHtml = current.outerHtml();
+ String currentHtml = element.outerHtml();
Matcher matcher = pattern.matcher(currentHtml);
if (matcher.find()) {
return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
}
- if (current != doc.body()) {
- current = current.parent();
+ if (element != doc.body()) {
+ element = element.parent();
}
}
+ return null;
+ }
+ private String handleTimeExtractionFailure(Element contentElement) throws Exception {
try {
return getDate(contentElement);
} catch (Exception ex) {
- throw new Exception("time not found");
+            throw new Exception("time not found");
}
-
}
protected String getDate(Element contentElement) throws Exception {
@@ -259,24 +253,6 @@ protected String getDate(Element contentElement) throws Exception {
throw new Exception("date not found");
}
- protected double strSim(String a, String b) {
- int len1 = a.length();
- int len2 = b.length();
- if (len1 == 0 || len2 == 0) {
- return 0;
- }
- double ratio;
- if (len1 > len2) {
- ratio = (len1 + 0.0) / len2;
- } else {
- ratio = (len2 + 0.0) / len1;
- }
- if (ratio >= 3) {
- return 0;
- }
- return (lcs(a, b) + 0.0) / Math.max(len1, len2);
- }
-
protected String getTitle(final Element contentElement) throws Exception {
final ArrayList titleList = new ArrayList();
final ArrayList titleSim = new ArrayList();
@@ -295,7 +271,7 @@ public void head(Node node, int i) {
String tagName = tag.tagName();
if (Pattern.matches("h[1-6]", tagName)) {
String title = tag.text().trim();
- double sim = strSim(title, metaTitle);
+ double sim = textAnalyzer.strSim(title, metaTitle);
titleSim.add(sim);
titleList.add(tag);
}
@@ -351,7 +327,7 @@ public void head(Node node, int i) {
if (node instanceof TextNode) {
TextNode tn = (TextNode) node;
String text = tn.text().trim();
- double sim = strSim(text, metaTitle);
+ double sim = textAnalyzer.strSim(text, metaTitle);
if (sim > 0) {
if (sim > max.get(0)) {
max.set(0, sim);
@@ -396,42 +372,6 @@ protected int lcs(String x, String y) {
}
- protected int editDistance(String word1, String word2) {
- int len1 = word1.length();
- int len2 = word2.length();
-
- int[][] dp = new int[len1 + 1][len2 + 1];
-
- for (int i = 0; i <= len1; i++) {
- dp[i][0] = i;
- }
-
- for (int j = 0; j <= len2; j++) {
- dp[0][j] = j;
- }
-
- for (int i = 0; i < len1; i++) {
- char c1 = word1.charAt(i);
- for (int j = 0; j < len2; j++) {
- char c2 = word2.charAt(j);
-
- if (c1 == c2) {
- dp[i + 1][j + 1] = dp[i][j];
- } else {
- int replace = dp[i][j] + 1;
- int insert = dp[i][j + 1] + 1;
- int delete = dp[i + 1][j] + 1;
-
- int min = replace > insert ? insert : replace;
- min = delete > min ? min : delete;
- dp[i + 1][j + 1] = min;
- }
- }
- }
-
- return dp[len1][len2];
- }
-
/*输入Jsoup的Document,获取正文所在Element*/
public static Element getContentElementByDoc(Document doc) throws Exception {
ContentExtractor ce = new ContentExtractor(doc);
@@ -485,24 +425,6 @@ public static String getContentByUrl(String url) throws Exception {
return getContentByHtml(html, url);
}
- /*输入Jsoup的Document,获取结构化新闻信息*/
- public static News getNewsByDoc(Document doc) throws Exception {
- ContentExtractor ce = new ContentExtractor(doc);
- return ce.getNews();
- }
-
- /*输入HTML,获取结构化新闻信息*/
- public static News getNewsByHtml(String html) throws Exception {
- Document doc = Jsoup.parse(html);
- return getNewsByDoc(doc);
- }
-
- /*输入HTML和URL,获取结构化新闻信息*/
- public static News getNewsByHtml(String html, String url) throws Exception {
- Document doc = Jsoup.parse(html, url);
- return getNewsByDoc(doc);
- }
-
/*输入URL,获取结构化新闻信息*/
public static News getNewsByUrl(String url) throws Exception {
// HttpRequest request = new HttpRequest(url);
diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java
new file mode 100644
index 00000000..8706ed11
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java
@@ -0,0 +1,69 @@
+package cn.edu.hfut.dmic.contentextractor;
+
+import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class NewsExtractor {
+
+
+ public static OkHttpRequester okHttpRequester = new OkHttpRequester();
+ private static final Logger LOG = LoggerFactory.getLogger(NewsExtractor.class);
+
+ private Document doc;
+ private ContentExtractor contentExtractor;
+
+    public NewsExtractor(Document doc) {
+        this.doc = doc;
+        this.contentExtractor = new ContentExtractor(doc);
+    }
+
+ public News getNews() throws Exception {
+ News news = new News();
+ Element contentElement;
+ try {
+ contentElement = contentExtractor.getContentElement();
+ news.setContentElement(contentElement);
+ } catch (Exception ex) {
+ LOG.info("news content extraction failed,extraction abort", ex);
+ throw new Exception(ex);
+ }
+
+ if (doc.baseUri() != null) {
+ news.setUrl(doc.baseUri());
+ }
+
+ try {
+ news.setTime(contentExtractor.getTime(contentElement));
+ } catch (Exception ex) {
+ LOG.info("news title extraction failed", ex);
+ }
+
+ try {
+ news.setTitle(contentExtractor.getTitle(contentElement));
+ } catch (Exception ex) {
+ LOG.info("title extraction failed", ex);
+ }
+ return news;
+ }
+
+ public static News getNewsByDoc(Document doc) throws Exception {
+ NewsExtractor newsExtractor=new NewsExtractor(doc);
+ return newsExtractor.getNews();
+ }
+
+ /*输入HTML,获取结构化新闻信息*/
+ public static News getNewsByHtml(String html) throws Exception {
+ Document doc = Jsoup.parse(html);
+ return getNewsByDoc(doc);
+ }
+
+ /*输入HTML和URL,获取结构化新闻信息*/
+ public static News getNewsByHtml(String html, String url) throws Exception {
+ Document doc = Jsoup.parse(html, url);
+ return getNewsByDoc(doc);
+ }
+
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java
new file mode 100644
index 00000000..05331e3d
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java
@@ -0,0 +1,78 @@
+package cn.edu.hfut.dmic.contentextractor;
+
+public class TextAnalyzer {
+ protected double strSim(String a, String b) {
+ int len1 = a.length();
+ int len2 = b.length();
+ if (len1 == 0 || len2 == 0) {
+ return 0;
+ }
+ double ratio;
+ if (len1 > len2) {
+ ratio = (len1 + 0.0) / len2;
+ } else {
+ ratio = (len2 + 0.0) / len1;
+ }
+ if (ratio >= 3) {
+ return 0;
+ }
+ return (lcs(a, b) + 0.0) / Math.max(len1, len2);
+ }
+
+ protected int lcs(String x, String y) {
+
+ int M = x.length();
+ int N = y.length();
+ if (M == 0 || N == 0) {
+ return 0;
+ }
+ int[][] opt = new int[M + 1][N + 1];
+
+ for (int i = M - 1; i >= 0; i--) {
+ for (int j = N - 1; j >= 0; j--) {
+ if (x.charAt(i) == y.charAt(j)) {
+ opt[i][j] = opt[i + 1][j + 1] + 1;
+ } else {
+ opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]);
+ }
+ }
+ }
+ return opt[0][0];
+ }
+
+ protected int editDistance(String word1, String word2) {
+ int len1 = word1.length();
+ int len2 = word2.length();
+
+ int[][] dp = new int[len1 + 1][len2 + 1];
+
+ for (int i = 0; i <= len1; i++) {
+ dp[i][0] = i;
+ }
+
+ for (int j = 0; j <= len2; j++) {
+ dp[0][j] = j;
+ }
+
+ for (int i = 0; i < len1; i++) {
+ char c1 = word1.charAt(i);
+ for (int j = 0; j < len2; j++) {
+ char c2 = word2.charAt(j);
+
+ if (c1 == c2) {
+ dp[i + 1][j + 1] = dp[i][j];
+ } else {
+ int replace = dp[i][j] + 1;
+ int insert = dp[i][j + 1] + 1;
+ int delete = dp[i + 1][j] + 1;
+
+ int min = replace > insert ? insert : replace;
+ min = delete > min ? min : delete;
+ dp[i + 1][j + 1] = min;
+ }
+ }
+ }
+
+ return dp[len1][len2];
+ }
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java b/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java
index 42b2f9e7..3d743019 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java
@@ -1,6 +1,7 @@
package cn.edu.hfut.dmic.webcollector.fetcher;
import cn.edu.hfut.dmic.webcollector.conf.DefaultConfigured;
+import cn.edu.hfut.dmic.webcollector.util.RegexRule;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
@@ -242,7 +243,9 @@ protected void parseLink(Page page, CrawlDatums next) {
if (conteType != null && conteType.contains("text/html")) {
Document doc = page.doc();
if (doc != null) {
- Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
+            // Use the already-configured regexRule field; constructing a new
+            // empty RegexRule here would discard every configured pattern.
+            Links links = regexRule.addByRegex(doc, regexRule, getConf().getAutoDetectImg());
next.add(links);
}
}
@@ -267,8 +270,7 @@ protected void parseLink(Page page, CrawlDatums next) {
// }
// };
//
-// VisitorMethodDispatcher visitorMethodDispatcher = new VisitorMethodDispatcher(visitor);
-//
+ //
// }
//
//
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java b/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java
index 0a33c5eb..5e3a3554 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java
@@ -16,10 +16,9 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package cn.edu.hfut.dmic.webcollector.model;
-
-import cn.edu.hfut.dmic.webcollector.util.CrawlDatumFormater;
import cn.edu.hfut.dmic.webcollector.util.GsonUtils;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;
+import cn.edu.hfut.dmic.webcollector.util.StatusFormatter;
import com.google.gson.*;
import java.io.Serializable;
@@ -65,6 +64,17 @@ public class CrawlDatum implements Serializable, MetaGetter, MetaSetter iterator() {
return dataList.iterator();
}
- public Links filterByRegex(RegexRule regexRule) {
- Iterator ite = iterator();
- while(ite.hasNext()){
- String url = ite.next();
- if (!regexRule.satisfy(url)) {
- ite.remove();
- }
- }
- return this;
- }
-
- public Links filterByRegex(String regex) {
- RegexRule regexRule = new RegexRule();
- regexRule.addRule(regex);
- return filterByRegex(regexRule);
- }
+// public Links filterByRegex(RegexRule regexRule) {
+// Iterator ite = iterator();
+// while(ite.hasNext()){
+// String url = ite.next();
+// if (!regexRule.satisfy(url)) {
+// ite.remove();
+// }
+// }
+// return this;
+// }
+//
+// public Links filterByRegex(String regex) {
+// RegexRule regexRule = new RegexRule();
+// regexRule.addRule(regex);
+// return filterByRegex(regexRule);
+// }
public Links addFromElement(Element ele) {
addFromElement(ele,false);
@@ -131,34 +131,34 @@ public Links addBySelector(Element ele, String cssSelector){
return addBySelector(ele ,cssSelector,false);
}
- public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) {
- for(String href: ele.select("a[href]").eachAttr("abs:href")){
- if (regexRule.satisfy(href)) {
- this.add(href);
- }
- }
- if(parseSrc) {
- for (String src : ele.select("*[src]").eachAttr("abs:src")){
- if(regexRule.satisfy(src)){
- this.add(src);
- }
- }
- }
- return this;
- }
-
- public Links addByRegex(Element ele, RegexRule regexRule) {
- return addByRegex(ele, regexRule, false);
- }
-
- public Links addByRegex(Element ele, String regex, boolean parseSrc) {
- RegexRule regexRule = new RegexRule(regex);
- return addByRegex(ele, regexRule, parseSrc);
- }
- public Links addByRegex(Element ele, String regex) {
- RegexRule regexRule = new RegexRule(regex);
- return addByRegex(ele,regexRule,false);
- }
+// public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) {
+// for(String href: ele.select("a[href]").eachAttr("abs:href")){
+// if (regexRule.satisfy(href)) {
+// this.add(href);
+// }
+// }
+// if(parseSrc) {
+// for (String src : ele.select("*[src]").eachAttr("abs:src")){
+// if(regexRule.satisfy(src)){
+// this.add(src);
+// }
+// }
+// }
+// return this;
+// }
+
+// public Links addByRegex(Element ele, RegexRule regexRule) {
+// return addByRegex(ele, regexRule, false);
+// }
+//
+// public Links addByRegex(Element ele, String regex, boolean parseSrc) {
+// RegexRule regexRule = new RegexRule(regex);
+// return addByRegex(ele, regexRule, parseSrc);
+// }
+// public Links addByRegex(Element ele, String regex) {
+// RegexRule regexRule = new RegexRule(regex);
+// return addByRegex(ele,regexRule,false);
+// }
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java b/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java
index 38fd1f52..e77726e0 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java
@@ -16,6 +16,7 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package cn.edu.hfut.dmic.webcollector.model;
+import cn.edu.hfut.dmic.webcollector.util.RegexRule;
import cn.edu.hfut.dmic.webcollector.util.CharsetDetector;
import java.io.UnsupportedEncodingException;
@@ -189,10 +190,11 @@ public Links links(String cssSelector) {
public Links regexLinks(RegexRule regexRule, boolean parseSrc) {
- return new Links().addByRegex(doc(), regexRule, parseSrc);
+ return regexRule.addByRegex(doc(), regexRule, parseSrc);
}
public Links regexLinks(String regex, boolean parseSrc){
- return new Links().addByRegex(doc(),regex,parseSrc);
+ RegexRule regexRule=new RegexRule();
+ return regexRule.addByRegex(doc(),regex,parseSrc);
}
public Links regexLinks(RegexRule regexRule) {
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java
index 38816d65..5fc4395b 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java
@@ -67,22 +67,36 @@ private static String guessEncodingByNutch(byte[] content) {
}
}
if (encoding == null) {
- if (length >= 3 && content[0] == (byte) 0xEF
- && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
+ if (isUTF8Signature(content, length)) {
encoding = "UTF-8";
- } else if (length >= 2) {
- if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
- encoding = "UTF-16LE";
- } else if (content[0] == (byte) 0xFE
- && content[1] == (byte) 0xFF) {
- encoding = "UTF-16BE";
- }
+ } else if (isUTF16LESignature(content, length)) {
+ encoding = "UTF-16LE";
+ } else if (isUTF16BESignature(content, length)) {
+ encoding = "UTF-16BE";
}
}
-
return encoding;
}
+ private static boolean isUTF8Signature(byte[] content, int length) {
+ return length >= 3
+ && content[0] == (byte) 0xEF
+ && content[1] == (byte) 0xBB
+ && content[2] == (byte) 0xBF;
+ }
+
+ private static boolean isUTF16LESignature(byte[] content, int length) {
+ return length >= 2
+ && content[0] == (byte) 0xFF
+ && content[1] == (byte) 0xFE;
+ }
+
+ private static boolean isUTF16BESignature(byte[] content, int length) {
+ return length >= 2
+ && content[0] == (byte) 0xFE
+ && content[1] == (byte) 0xFF;
+ }
+
/**
* 根据字节数组,猜测可能的字符集,如果检测失败,返回utf-8
*
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java
index 2c32bf99..377d279b 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java
@@ -42,18 +42,7 @@ public static String datumToString(CrawlDatum datum) {
.append("\nURL: ").append(datum.url())
.append("\nSTATUS: ");
- switch (datum.getStatus()) {
- case CrawlDatum.STATUS_DB_SUCCESS:
- sb.append("success");
- break;
- case CrawlDatum.STATUS_DB_FAILED:
- sb.append("failed");
- break;
- case CrawlDatum.STATUS_DB_UNEXECUTED:
- sb.append("unexecuted");
- break;
- }
-
+        sb.append(getStatusFormatter(datum.getStatus()).format(datum));
sb.append("\nExecuteTime: ")
.append(sdf.format(new Date(datum.getExecuteTime())))
.append("\nExecuteCount: ").append(datum.getExecuteCount())
@@ -76,6 +65,19 @@ public static String datumToString(CrawlDatum datum) {
return sb.toString();
}
+ private static StatusFormatter getStatusFormatter(int status) {
+ switch (status) {
+ case CrawlDatum.STATUS_DB_SUCCESS:
+ return new SuccessStatusFormatter();
+ case CrawlDatum.STATUS_DB_FAILED:
+ return new FailedStatusFormatter();
+ case CrawlDatum.STATUS_DB_UNEXECUTED:
+ return new UnexecutedStatusFormatter();
+ default:
+ throw new IllegalArgumentException("Unknown status: " + status);
+ }
+ }
+
// public static CrawlDatum jsonStrToDatum(String crawlDatumKey, String jsonStr) {
// JsonArray jsonArray = GsonUtils.parse(jsonStr).getAsJsonArray();
//
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java
new file mode 100644
index 00000000..efa5b776
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java
@@ -0,0 +1,9 @@
+package cn.edu.hfut.dmic.webcollector.util;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
+
+public class FailedStatusFormatter implements StatusFormatter{
+ public String format(CrawlDatum datum) {
+ return "failed";
+ }
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java
index ea212d37..74f3b2bb 100644
--- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java
@@ -18,16 +18,24 @@
package cn.edu.hfut.dmic.webcollector.util;
+import cn.edu.hfut.dmic.webcollector.model.Links;
+import org.jsoup.nodes.Element;
+
import java.util.ArrayList;
+import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
+
+
/**
*
* @author hu
*/
public class RegexRule {
-
+
+ Links links=new Links();
+
public RegexRule(){
}
@@ -129,6 +137,62 @@ public boolean satisfy(String str) {
} else {
return true;
}
+ }
+
+
+ public Links filterByRegex(String regex) {
+ RegexRule regexRule = new RegexRule();
+ regexRule.addRule(regex);
+ return filterByRegex(regexRule);
+ }
+
+ public Links filterByRegex(RegexRule regexRule) {
+ Iterator ite = iterator();
+ while(ite.hasNext()){
+ String url = ite.next();
+ if (!regexRule.satisfy(url)) {
+ ite.remove();
+ }
+ }
+ return links;
+ }
+
+    private Iterator iterator() {
+        return links.iterator();
+    }
+
+
+
+ public Links addByRegex(Element ele, String regex, boolean parseSrc) {
+ RegexRule regexRule = new RegexRule(regex);
+ return addByRegex(ele, regexRule, parseSrc);
+ }
+
+ public Links addByRegex(Element ele, String regex) {
+ RegexRule regexRule = new RegexRule(regex);
+ return addByRegex(ele,regexRule,false);
+ }
+ public Links addByRegex(Element ele, RegexRule regexRule) {
+ return addByRegex(ele, regexRule, false);
+ }
+
+ public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) {
+ // This method should use the regexRule parameter to add links to the Links object.
+ Links links = new Links();
+ for (String href : ele.select("a[href]").eachAttr("abs:href")) {
+ if (regexRule.satisfy(href)) {
+ links.add(href);
+ }
+ }
+ if (parseSrc) {
+        for (String src : ele.select("*[src]").eachAttr("abs:src")) {
+ if (regexRule.satisfy(src)) {
+ links.add(src);
+ }
+ }
+ }
+ return links;
}
}
+
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java
new file mode 100644
index 00000000..25f06706
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java
@@ -0,0 +1,7 @@
+package cn.edu.hfut.dmic.webcollector.util;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
+
+public interface StatusFormatter {
+ String format(CrawlDatum datum);
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java
new file mode 100644
index 00000000..17df2953
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java
@@ -0,0 +1,9 @@
+package cn.edu.hfut.dmic.webcollector.util;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
+
+public class SuccessStatusFormatter implements StatusFormatter{
+ public String format(CrawlDatum datum) {
+ return "success";
+ }
+}
diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java
new file mode 100644
index 00000000..2c96e3f5
--- /dev/null
+++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java
@@ -0,0 +1,9 @@
+package cn.edu.hfut.dmic.webcollector.util;
+
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
+
+public class UnexecutedStatusFormatter implements StatusFormatter{
+ public String format(CrawlDatum datum) {
+ return "unexecuted";
+ }
+}