From cdfb5038438130165cd4b7273080021a9048a80c Mon Sep 17 00:00:00 2001 From: bhavyadave Date: Sun, 31 Mar 2024 14:29:13 -0300 Subject: [PATCH] refactor: design and implementation smells --- pom.xml | 4 +- .../contentextractor/ContentExtractor.java | 142 ++++-------------- .../dmic/contentextractor/NewsExtractor.java | 69 +++++++++ .../dmic/contentextractor/TextAnalyzer.java | 78 ++++++++++ .../fetcher/VisitorMethodDispatcher.java | 8 +- .../dmic/webcollector/model/CrawlDatum.java | 19 ++- .../webcollector/model/CrawlDatumService.java | 7 + .../hfut/dmic/webcollector/model/Links.java | 88 +++++------ .../hfut/dmic/webcollector/model/Page.java | 6 +- .../webcollector/util/CharsetDetector.java | 34 +++-- .../webcollector/util/CrawlDatumFormater.java | 26 ++-- .../util/FailedStatusFormatter.java | 9 ++ .../dmic/webcollector/util/RegexRule.java | 66 +++++++- .../webcollector/util/StatusFormatter.java | 7 + .../util/SuccessStatusFormatter.java | 9 ++ .../util/UnexecutedStatusFormatter.java | 9 ++ 16 files changed, 391 insertions(+), 190 deletions(-) create mode 100644 src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java create mode 100644 src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java create mode 100644 src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatumService.java create mode 100644 src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java create mode 100644 src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java create mode 100644 src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java create mode 100644 src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java diff --git a/pom.xml b/pom.xml index 1e16a891..d48ec2e2 100644 --- a/pom.xml +++ b/pom.xml @@ -12,8 +12,8 @@ jar UTF-8 - 1.6 - 1.6 + 21 + 21 diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java index 770d07f9..00a86aee 100644 --- a/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java +++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java @@ -34,6 +34,8 @@ import org.jsoup.select.NodeVisitor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static cn.edu.hfut.dmic.contentextractor.NewsExtractor.getNewsByHtml; + /** * ContentExtractor could extract content,title,time from news webpage @@ -50,6 +52,8 @@ public class ContentExtractor { this.doc = doc; } + TextAnalyzer textAnalyzer=new TextAnalyzer(); + protected HashMap infoMap = new HashMap(); class CountInfo { @@ -122,7 +126,11 @@ protected CountInfo computeInfo(Node node) { protected double computeScore(Element tag) { CountInfo countInfo = infoMap.get(tag); double var = Math.sqrt(computeVar(countInfo.leafList) + 1); - double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2); +// double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2); + double logVar = Math.log(var); + double adjustedTextCount = Math.log(countInfo.textCount - countInfo.linkTextCount + 1); + double adjustedPCount = Math.log10(countInfo.pCount + 2); + double score = logVar * countInfo.densitySum * adjustedTextCount * adjustedPCount; return score; } @@ -168,38 +176,16 @@ public Element getContentElement() throws Exception { return content; } - public News getNews() throws Exception { - News news = new News(); - Element contentElement; - try { - contentElement = getContentElement(); - news.setContentElement(contentElement); - } catch (Exception ex) { - LOG.info("news content extraction failed,extraction abort", ex); - throw new Exception(ex); - } - - if (doc.baseUri() != null) { - news.setUrl(doc.baseUri()); - } - - try { - news.setTime(getTime(contentElement)); - } catch (Exception ex) { - LOG.info("news title extraction failed", ex); - } - - try { - news.setTitle(getTitle(contentElement)); - } catch (Exception ex) { - LOG.info("title extraction failed", ex); + protected String getTime(Element contentElement) throws Exception { + Element searchedElement = climbDOMTree(contentElement); + String time = findTimeInElement(searchedElement); + if (time != null) { + return time; } - return news; + return handleTimeExtractionFailure(contentElement); } - protected String getTime(Element contentElement) throws Exception { - String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"; - Pattern pattern = Pattern.compile(regex); + private Element climbDOMTree(Element contentElement) { Element current = contentElement; for (int i = 0; i < 2; i++) { if (current != null && current != doc.body()) { @@ -209,26 +195,34 @@ protected String getTime(Element contentElement) throws Exception { } } } + return current; + } + + private String findTimeInElement(Element element) { + String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"; + Pattern pattern = Pattern.compile(regex); for (int i = 0; i < 6; i++) { - if (current == null) { + if (element == null) { break; } - String currentHtml = current.outerHtml(); + String currentHtml = element.outerHtml(); Matcher matcher = pattern.matcher(currentHtml); if (matcher.find()) { return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6); } - if (current != doc.body()) { - current = current.parent(); + if (element != doc.body()) { + element = element.parent(); } } + return null; + } + private String handleTimeExtractionFailure(Element contentElement) throws Exception { try { return getDate(contentElement); } catch (Exception ex) { - throw new Exception("time not found"); + throw new Exception("Time not found"); } - } protected String getDate(Element contentElement) throws Exception { @@ -259,24 +253,6 @@ protected String getDate(Element contentElement) throws Exception { throw new Exception("date not found"); } - protected double strSim(String a, String b) { - int len1 = a.length(); - int len2 = b.length(); - if (len1 == 0 || len2 == 0) { - return 0; - } - double ratio; - if (len1 > len2) { - ratio = (len1 + 0.0) / len2; - } else { - ratio = (len2 + 0.0) / len1; - } - if (ratio >= 3) { - return 0; - } - return (lcs(a, b) + 0.0) / Math.max(len1, len2); - } - protected String getTitle(final Element contentElement) throws Exception { final ArrayList titleList = new ArrayList(); final ArrayList titleSim = new ArrayList(); @@ -295,7 +271,7 @@ public void head(Node node, int i) { String tagName = tag.tagName(); if (Pattern.matches("h[1-6]", tagName)) { String title = tag.text().trim(); - double sim = strSim(title, metaTitle); + double sim = textAnalyzer.strSim(title, metaTitle); titleSim.add(sim); titleList.add(tag); } @@ -351,7 +327,7 @@ public void head(Node node, int i) { if (node instanceof TextNode) { TextNode tn = (TextNode) node; String text = tn.text().trim(); - double sim = strSim(text, metaTitle); + double sim = textAnalyzer.strSim(text, metaTitle); if (sim > 0) { if (sim > max.get(0)) { max.set(0, sim); @@ -396,42 +372,6 @@ protected int lcs(String x, String y) { } - protected int editDistance(String word1, String word2) { - int len1 = word1.length(); - int len2 = word2.length(); - - int[][] dp = new int[len1 + 1][len2 + 1]; - - for (int i = 0; i <= len1; i++) { - dp[i][0] = i; - } - - for (int j = 0; j <= len2; j++) { - dp[0][j] = j; - } - - for (int i = 0; i < len1; i++) { - char c1 = word1.charAt(i); - for (int j = 0; j < len2; j++) { - char c2 = word2.charAt(j); - - if (c1 == c2) { - dp[i + 1][j + 1] = dp[i][j]; - } else { - int replace = dp[i][j] + 1; - int insert = dp[i][j + 1] + 1; - int delete = dp[i + 1][j] + 1; - - int min = replace > insert ? insert : replace; - min = delete > min ? min : delete; - dp[i + 1][j + 1] = min; - } - } - } - - return dp[len1][len2]; - } - /*输入Jsoup的Document,获取正文所在Element*/ public static Element getContentElementByDoc(Document doc) throws Exception { ContentExtractor ce = new ContentExtractor(doc); @@ -485,24 +425,6 @@ public static String getContentByUrl(String url) throws Exception { return getContentByHtml(html, url); } - /*输入Jsoup的Document,获取结构化新闻信息*/ - public static News getNewsByDoc(Document doc) throws Exception { - ContentExtractor ce = new ContentExtractor(doc); - return ce.getNews(); - } - - /*输入HTML,获取结构化新闻信息*/ - public static News getNewsByHtml(String html) throws Exception { - Document doc = Jsoup.parse(html); - return getNewsByDoc(doc); - } - - /*输入HTML和URL,获取结构化新闻信息*/ - public static News getNewsByHtml(String html, String url) throws Exception { - Document doc = Jsoup.parse(html, url); - return getNewsByDoc(doc); - } - /*输入URL,获取结构化新闻信息*/ public static News getNewsByUrl(String url) throws Exception { // HttpRequest request = new HttpRequest(url); diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java new file mode 100644 index 00000000..8706ed11 --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java @@ -0,0 +1,69 @@ +package cn.edu.hfut.dmic.contentextractor; + +import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class NewsExtractor { + News news=new News(); + + public static OkHttpRequester okHttpRequester = new OkHttpRequester(); + private static final Logger LOG = LoggerFactory.getLogger(NewsExtractor.class); + + private Document doc; + private ContentExtractor contentExtractor; + + public NewsExtractor(Document doc) { + this.contentExtractor = new ContentExtractor(doc); + } + + public News getNews() throws Exception { + News news = new News(); + Element contentElement; + try { + contentElement = contentExtractor.getContentElement(); + news.setContentElement(contentElement); + } catch (Exception ex) { + LOG.info("news content extraction failed,extraction abort", ex); + throw new Exception(ex); + } + + if (doc.baseUri() != null) { + news.setUrl(doc.baseUri()); + } + + try { + news.setTime(contentExtractor.getTime(contentElement)); + } catch (Exception ex) { + LOG.info("news title extraction failed", ex); + } + + try { + news.setTitle(contentExtractor.getTitle(contentElement)); + } catch (Exception ex) { + LOG.info("title extraction failed", ex); + } + return news; + } + + public static News getNewsByDoc(Document doc) throws Exception { + NewsExtractor newsExtractor=new NewsExtractor(doc); + return newsExtractor.getNews(); + } + + /*输入HTML,获取结构化新闻信息*/ + public static News getNewsByHtml(String html) throws Exception { + Document doc = Jsoup.parse(html); + return getNewsByDoc(doc); + } + + /*输入HTML和URL,获取结构化新闻信息*/ + public static News getNewsByHtml(String html, String url) throws Exception { + Document doc = Jsoup.parse(html, url); + return getNewsByDoc(doc); + } + +} diff --git a/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java b/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java new file mode 100644 index 00000000..05331e3d --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java @@ -0,0 +1,78 @@ +package cn.edu.hfut.dmic.contentextractor; + +public class TextAnalyzer { + protected double strSim(String a, String b) { + int len1 = a.length(); + int len2 = b.length(); + if (len1 == 0 || len2 == 0) { + return 0; + } + double ratio; + if (len1 > len2) { + ratio = (len1 + 0.0) / len2; + } else { + ratio = (len2 + 0.0) / len1; + } + if (ratio >= 3) { + return 0; + } + return (lcs(a, b) + 0.0) / Math.max(len1, len2); + } + + protected int lcs(String x, String y) { + + int M = x.length(); + int N = y.length(); + if (M == 0 || N == 0) { + return 0; + } + int[][] opt = new int[M + 1][N + 1]; + + for (int i = M - 1; i >= 0; i--) { + for (int j = N - 1; j >= 0; j--) { + if (x.charAt(i) == y.charAt(j)) { + opt[i][j] = opt[i + 1][j + 1] + 1; + } else { + opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]); + } + } + } + return opt[0][0]; + } + + protected int editDistance(String word1, String word2) { + int len1 = word1.length(); + int len2 = word2.length(); + + int[][] dp = new int[len1 + 1][len2 + 1]; + + for (int i = 0; i <= len1; i++) { + dp[i][0] = i; + } + + for (int j = 0; j <= len2; j++) { + dp[0][j] = j; + } + + for (int i = 0; i < len1; i++) { + char c1 = word1.charAt(i); + for (int j = 0; j < len2; j++) { + char c2 = word2.charAt(j); + + if (c1 == c2) { + dp[i + 1][j + 1] = dp[i][j]; + } else { + int replace = dp[i][j] + 1; + int insert = dp[i][j + 1] + 1; + int delete = dp[i + 1][j] + 1; + + int min = replace > insert ? insert : replace; + min = delete > min ? min : delete; + dp[i + 1][j + 1] = min; + } + } + } + + return dp[len1][len2]; + } +} diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java b/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java index 42b2f9e7..3d743019 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/fetcher/VisitorMethodDispatcher.java @@ -1,6 +1,7 @@ package cn.edu.hfut.dmic.webcollector.fetcher; import cn.edu.hfut.dmic.webcollector.conf.DefaultConfigured; +import cn.edu.hfut.dmic.webcollector.util.RegexRule; import cn.edu.hfut.dmic.webcollector.model.CrawlDatums; import cn.edu.hfut.dmic.webcollector.model.Links; import cn.edu.hfut.dmic.webcollector.model.Page; @@ -242,7 +243,9 @@ protected void parseLink(Page page, CrawlDatums next) { if (conteType != null && conteType.contains("text/html")) { Document doc = page.doc(); if (doc != null) { - Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg()); + RegexRule regexRule = new RegexRule(); // Presuming you have set the regex patterns somewhere + + Links links = regexRule.addByRegex(doc, regexRule, getConf().getAutoDetectImg()); next.add(links); } } @@ -267,8 +270,7 @@ protected void parseLink(Page page, CrawlDatums next) { // } // }; // -// VisitorMethodDispatcher visitorMethodDispatcher = new VisitorMethodDispatcher(visitor); -// + // // } // // diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java b/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java index 0a33c5eb..5e3a3554 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/model/CrawlDatum.java @@ -16,10 +16,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package cn.edu.hfut.dmic.webcollector.model; - -import cn.edu.hfut.dmic.webcollector.util.CrawlDatumFormater; import cn.edu.hfut.dmic.webcollector.util.GsonUtils; import cn.edu.hfut.dmic.webcollector.util.RegexRule; +import cn.edu.hfut.dmic.webcollector.util.StatusFormatter; import com.google.gson.*; import java.io.Serializable; @@ -65,6 +64,17 @@ public class CrawlDatum implements Serializable, MetaGetter, MetaSetter iterator() { return dataList.iterator(); } - public Links filterByRegex(RegexRule regexRule) { - Iterator ite = iterator(); - while(ite.hasNext()){ - String url = ite.next(); - if (!regexRule.satisfy(url)) { - ite.remove(); - } - } - return this; - } - - public Links filterByRegex(String regex) { - RegexRule regexRule = new RegexRule(); - regexRule.addRule(regex); - return filterByRegex(regexRule); - } +// public Links filterByRegex(RegexRule regexRule) { +// Iterator ite = iterator(); +// while(ite.hasNext()){ +// String url = ite.next(); +// if (!regexRule.satisfy(url)) { +// ite.remove(); +// } +// } +// return this; +// } +// +// public Links filterByRegex(String regex) { +// RegexRule regexRule = new RegexRule(); +// regexRule.addRule(regex); +// return filterByRegex(regexRule); +// } public Links addFromElement(Element ele) { addFromElement(ele,false); @@ -131,34 +131,34 @@ public Links addBySelector(Element ele, String cssSelector){ return addBySelector(ele ,cssSelector,false); } - public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) { - for(String href: ele.select("a[href]").eachAttr("abs:href")){ - if (regexRule.satisfy(href)) { - this.add(href); - } - } - if(parseSrc) { - for (String src : ele.select("*[src]").eachAttr("abs:src")){ - if(regexRule.satisfy(src)){ - this.add(src); - } - } - } - return this; - } - - public Links addByRegex(Element ele, RegexRule regexRule) { - return addByRegex(ele, regexRule, false); - } - - public Links addByRegex(Element ele, String regex, boolean parseSrc) { - RegexRule regexRule = new RegexRule(regex); - return addByRegex(ele, regexRule, parseSrc); - } - public Links addByRegex(Element ele, String regex) { - RegexRule regexRule = new RegexRule(regex); - return addByRegex(ele,regexRule,false); - } +// public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) { +// for(String href: ele.select("a[href]").eachAttr("abs:href")){ +// if (regexRule.satisfy(href)) { +// this.add(href); +// } +// } +// if(parseSrc) { +// for (String src : ele.select("*[src]").eachAttr("abs:src")){ +// if(regexRule.satisfy(src)){ +// this.add(src); +// } +// } +// } +// return this; +// } + +// public Links addByRegex(Element ele, RegexRule regexRule) { +// return addByRegex(ele, regexRule, false); +// } +// +// public Links addByRegex(Element ele, String regex, boolean parseSrc) { +// RegexRule regexRule = new RegexRule(regex); +// return addByRegex(ele, regexRule, parseSrc); +// } +// public Links addByRegex(Element ele, String regex) { +// RegexRule regexRule = new RegexRule(regex); +// return addByRegex(ele,regexRule,false); +// } diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java b/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java index 38fd1f52..e77726e0 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/model/Page.java @@ -16,6 +16,7 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package cn.edu.hfut.dmic.webcollector.model; +import cn.edu.hfut.dmic.webcollector.util.RegexRule; import cn.edu.hfut.dmic.webcollector.util.CharsetDetector; import java.io.UnsupportedEncodingException; @@ -189,10 +190,11 @@ public Links links(String cssSelector) { public Links regexLinks(RegexRule regexRule, boolean parseSrc) { - return new Links().addByRegex(doc(), regexRule, parseSrc); + return regexRule.addByRegex(doc(), regexRule, parseSrc); } public Links regexLinks(String regex, boolean parseSrc){ - return new Links().addByRegex(doc(),regex,parseSrc); + RegexRule regexRule=new RegexRule(); + return regexRule.addByRegex(doc(),regex,parseSrc); } public Links regexLinks(RegexRule regexRule) { diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java index 38816d65..5fc4395b 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CharsetDetector.java @@ -67,22 +67,36 @@ private static String guessEncodingByNutch(byte[] content) { } } if (encoding == null) { - if (length >= 3 && content[0] == (byte) 0xEF - && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) { + if (isUTF8Signature(content, length)) { encoding = "UTF-8"; - } else if (length >= 2) { - if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) { - encoding = "UTF-16LE"; - } else if (content[0] == (byte) 0xFE - && content[1] == (byte) 0xFF) { - encoding = "UTF-16BE"; - } + } else if (isUTF16LESignature(content, length)) { + encoding = "UTF-16LE"; + } else if (isUTF16BESignature(content, length)) { + encoding = "UTF-16BE"; } } - return encoding; } + private static boolean isUTF8Signature(byte[] content, int length) { + return length >= 3 + && content[0] == (byte) 0xEF + && content[1] == (byte) 0xBB + && content[2] == (byte) 0xBF; + } + + private static boolean isUTF16LESignature(byte[] content, int length) { + return length >= 2 + && content[0] == (byte) 0xFF + && content[1] == (byte) 0xFE; + } + + private static boolean isUTF16BESignature(byte[] content, int length) { + return length >= 2 + && content[0] == (byte) 0xFE + && content[1] == (byte) 0xFF; + } + /** * 根据字节数组,猜测可能的字符集,如果检测失败,返回utf-8 * diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java index 2c32bf99..377d279b 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/CrawlDatumFormater.java @@ -42,18 +42,7 @@ public static String datumToString(CrawlDatum datum) { .append("\nURL: ").append(datum.url()) .append("\nSTATUS: "); - switch (datum.getStatus()) { - case CrawlDatum.STATUS_DB_SUCCESS: - sb.append("success"); - break; - case CrawlDatum.STATUS_DB_FAILED: - sb.append("failed"); - break; - case CrawlDatum.STATUS_DB_UNEXECUTED: - sb.append("unexecuted"); - break; - } - + String status = getStatusFormatter(datum.getStatus()).format(datum); sb.append("\nExecuteTime: ") .append(sdf.format(new Date(datum.getExecuteTime()))) .append("\nExecuteCount: ").append(datum.getExecuteCount()) @@ -76,6 +65,19 @@ public static String datumToString(CrawlDatum datum) { return sb.toString(); } + private static StatusFormatter getStatusFormatter(int status) { + switch (status) { + case CrawlDatum.STATUS_DB_SUCCESS: + return new SuccessStatusFormatter(); + case CrawlDatum.STATUS_DB_FAILED: + return new FailedStatusFormatter(); + case CrawlDatum.STATUS_DB_UNEXECUTED: + return new UnexecutedStatusFormatter(); + default: + throw new IllegalArgumentException("Unknown status: " + status); + } + } + // public static CrawlDatum jsonStrToDatum(String crawlDatumKey, String jsonStr) { // JsonArray jsonArray = GsonUtils.parse(jsonStr).getAsJsonArray(); // diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java new file mode 100644 index 00000000..efa5b776 --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/FailedStatusFormatter.java @@ -0,0 +1,9 @@ +package cn.edu.hfut.dmic.webcollector.util; + +import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; + +public class FailedStatusFormatter implements StatusFormatter{ + public String format(CrawlDatum datum) { + return "failed"; + } +} diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java index ea212d37..74f3b2bb 100644 --- a/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/RegexRule.java @@ -18,16 +18,24 @@ package cn.edu.hfut.dmic.webcollector.util; +import cn.edu.hfut.dmic.webcollector.model.Links; +import org.jsoup.nodes.Element; + import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; +import static java.util.Spliterators.iterator; + /** * * @author hu */ public class RegexRule { - + + Links links=new Links(); + public RegexRule(){ } @@ -129,6 +137,62 @@ public boolean satisfy(String str) { } else { return true; } + } + + + public Links filterByRegex(String regex) { + RegexRule regexRule = new RegexRule(); + regexRule.addRule(regex); + return filterByRegex(regexRule); + } + + public Links filterByRegex(RegexRule regexRule) { + Iterator ite = iterator(); + while(ite.hasNext()){ + String url = ite.next(); + if (!regexRule.satisfy(url)) { + ite.remove(); + } + } + return links; + } + + private Iterator iterator() { + return null; + } + + + + public Links addByRegex(Element ele, String regex, boolean parseSrc) { + RegexRule regexRule = new RegexRule(regex); + return addByRegex(ele, regexRule, parseSrc); + } + + public Links addByRegex(Element ele, String regex) { + RegexRule regexRule = new RegexRule(regex); + return addByRegex(ele,regexRule,false); + } + public Links addByRegex(Element ele, RegexRule regexRule) { + return addByRegex(ele, regexRule, false); + } + + public Links addByRegex(Element ele, RegexRule regexRule, boolean parseSrc) { + // This method should use the regexRule parameter to add links to the Links object. + Links links = new Links(); + for (String href : ele.select("a[href]").eachAttr("abs:href")) { + if (regexRule.satisfy(href)) { + links.add(href); + } + } + if (parseSrc) { + for (String src : ele.select("img[src]").eachAttr("abs:src")) { + if (regexRule.satisfy(src)) { + links.add(src); + } + } + } + return links; } } + diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java new file mode 100644 index 00000000..25f06706 --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/StatusFormatter.java @@ -0,0 +1,7 @@ +package cn.edu.hfut.dmic.webcollector.util; + +import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; + +public interface StatusFormatter { + String format(CrawlDatum datum); +} diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java new file mode 100644 index 00000000..17df2953 --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/SuccessStatusFormatter.java @@ -0,0 +1,9 @@ +package cn.edu.hfut.dmic.webcollector.util; + +import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; + +public class SuccessStatusFormatter implements StatusFormatter{ + public String format(CrawlDatum datum) { + return "success"; + } +} diff --git a/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java b/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java new file mode 100644 index 00000000..2c96e3f5 --- /dev/null +++ b/src/main/java/cn/edu/hfut/dmic/webcollector/util/UnexecutedStatusFormatter.java @@ -0,0 +1,9 @@ +package cn.edu.hfut.dmic.webcollector.util; + +import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; + +public class UnexecutedStatusFormatter implements StatusFormatter{ + public String format(CrawlDatum datum) { + return "unexecuted"; + } +}