Skip to content

refactor: design and implementation smells #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
</properties>

<licenses>
Expand Down
142 changes: 32 additions & 110 deletions src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static cn.edu.hfut.dmic.contentextractor.NewsExtractor.getNewsByHtml;


/**
* ContentExtractor could extract content,title,time from news webpage
Expand All @@ -50,6 +52,8 @@ public class ContentExtractor {
this.doc = doc;
}

TextAnalyzer textAnalyzer=new TextAnalyzer();

protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();

class CountInfo {
Expand Down Expand Up @@ -122,7 +126,11 @@ protected CountInfo computeInfo(Node node) {
protected double computeScore(Element tag) {
CountInfo countInfo = infoMap.get(tag);
double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
// double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
double logVar = Math.log(var);
double adjustedTextCount = Math.log(countInfo.textCount - countInfo.linkTextCount + 1);
double adjustedPCount = Math.log10(countInfo.pCount + 2);
double score = logVar * countInfo.densitySum * adjustedTextCount * adjustedPCount;
return score;
}

Expand Down Expand Up @@ -168,38 +176,16 @@ public Element getContentElement() throws Exception {
return content;
}

public News getNews() throws Exception {
News news = new News();
Element contentElement;
try {
contentElement = getContentElement();
news.setContentElement(contentElement);
} catch (Exception ex) {
LOG.info("news content extraction failed,extraction abort", ex);
throw new Exception(ex);
}

if (doc.baseUri() != null) {
news.setUrl(doc.baseUri());
}

try {
news.setTime(getTime(contentElement));
} catch (Exception ex) {
LOG.info("news title extraction failed", ex);
}

try {
news.setTitle(getTitle(contentElement));
} catch (Exception ex) {
LOG.info("title extraction failed", ex);
protected String getTime(Element contentElement) throws Exception {
Element searchedElement = climbDOMTree(contentElement);
String time = findTimeInElement(searchedElement);
if (time != null) {
return time;
}
return news;
return handleTimeExtractionFailure(contentElement);
}

protected String getTime(Element contentElement) throws Exception {
String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
Pattern pattern = Pattern.compile(regex);
private Element climbDOMTree(Element contentElement) {
Element current = contentElement;
for (int i = 0; i < 2; i++) {
if (current != null && current != doc.body()) {
Expand All @@ -209,26 +195,34 @@ protected String getTime(Element contentElement) throws Exception {
}
}
}
return current;
}

private String findTimeInElement(Element element) {
String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
Pattern pattern = Pattern.compile(regex);
for (int i = 0; i < 6; i++) {
if (current == null) {
if (element == null) {
break;
}
String currentHtml = current.outerHtml();
String currentHtml = element.outerHtml();
Matcher matcher = pattern.matcher(currentHtml);
if (matcher.find()) {
return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
}
if (current != doc.body()) {
current = current.parent();
if (element != doc.body()) {
element = element.parent();
}
}
return null;
}

private String handleTimeExtractionFailure(Element contentElement) throws Exception {
try {
return getDate(contentElement);
} catch (Exception ex) {
throw new Exception("time not found");
throw new Exception("Time not found");
}

}

protected String getDate(Element contentElement) throws Exception {
Expand Down Expand Up @@ -259,24 +253,6 @@ protected String getDate(Element contentElement) throws Exception {
throw new Exception("date not found");
}

protected double strSim(String a, String b) {
int len1 = a.length();
int len2 = b.length();
if (len1 == 0 || len2 == 0) {
return 0;
}
double ratio;
if (len1 > len2) {
ratio = (len1 + 0.0) / len2;
} else {
ratio = (len2 + 0.0) / len1;
}
if (ratio >= 3) {
return 0;
}
return (lcs(a, b) + 0.0) / Math.max(len1, len2);
}

protected String getTitle(final Element contentElement) throws Exception {
final ArrayList<Element> titleList = new ArrayList<Element>();
final ArrayList<Double> titleSim = new ArrayList<Double>();
Expand All @@ -295,7 +271,7 @@ public void head(Node node, int i) {
String tagName = tag.tagName();
if (Pattern.matches("h[1-6]", tagName)) {
String title = tag.text().trim();
double sim = strSim(title, metaTitle);
double sim = textAnalyzer.strSim(title, metaTitle);
titleSim.add(sim);
titleList.add(tag);
}
Expand Down Expand Up @@ -351,7 +327,7 @@ public void head(Node node, int i) {
if (node instanceof TextNode) {
TextNode tn = (TextNode) node;
String text = tn.text().trim();
double sim = strSim(text, metaTitle);
double sim = textAnalyzer.strSim(text, metaTitle);
if (sim > 0) {
if (sim > max.get(0)) {
max.set(0, sim);
Expand Down Expand Up @@ -396,42 +372,6 @@ protected int lcs(String x, String y) {

}

protected int editDistance(String word1, String word2) {
int len1 = word1.length();
int len2 = word2.length();

int[][] dp = new int[len1 + 1][len2 + 1];

for (int i = 0; i <= len1; i++) {
dp[i][0] = i;
}

for (int j = 0; j <= len2; j++) {
dp[0][j] = j;
}

for (int i = 0; i < len1; i++) {
char c1 = word1.charAt(i);
for (int j = 0; j < len2; j++) {
char c2 = word2.charAt(j);

if (c1 == c2) {
dp[i + 1][j + 1] = dp[i][j];
} else {
int replace = dp[i][j] + 1;
int insert = dp[i][j + 1] + 1;
int delete = dp[i + 1][j] + 1;

int min = replace > insert ? insert : replace;
min = delete > min ? min : delete;
dp[i + 1][j + 1] = min;
}
}
}

return dp[len1][len2];
}

/*输入Jsoup的Document,获取正文所在Element*/
public static Element getContentElementByDoc(Document doc) throws Exception {
ContentExtractor ce = new ContentExtractor(doc);
Expand Down Expand Up @@ -485,24 +425,6 @@ public static String getContentByUrl(String url) throws Exception {
return getContentByHtml(html, url);
}

/*输入Jsoup的Document,获取结构化新闻信息*/
public static News getNewsByDoc(Document doc) throws Exception {
ContentExtractor ce = new ContentExtractor(doc);
return ce.getNews();
}

/*输入HTML,获取结构化新闻信息*/
public static News getNewsByHtml(String html) throws Exception {
Document doc = Jsoup.parse(html);
return getNewsByDoc(doc);
}

/*输入HTML和URL,获取结构化新闻信息*/
public static News getNewsByHtml(String html, String url) throws Exception {
Document doc = Jsoup.parse(html, url);
return getNewsByDoc(doc);
}

/*输入URL,获取结构化新闻信息*/
public static News getNewsByUrl(String url) throws Exception {
// HttpRequest request = new HttpRequest(url);
Expand Down
69 changes: 69 additions & 0 deletions src/main/java/cn/edu/hfut/dmic/contentextractor/NewsExtractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package cn.edu.hfut.dmic.contentextractor;

import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builds a structured {@link News} object from a Jsoup {@link Document}.
 *
 * <p>The content element, time and title are obtained by delegating to a
 * {@link ContentExtractor} created for the same document; the URL is taken from the
 * document's base URI.
 */
public class NewsExtractor {
    // NOTE(review): this field is never read or written inside this class (getNews()
    // works on its own local instance). Kept only so that any same-package code that
    // references it keeps compiling -- confirm it is unused and remove it.
    News news = new News();

    public static OkHttpRequester okHttpRequester = new OkHttpRequester();
    private static final Logger LOG = LoggerFactory.getLogger(NewsExtractor.class);

    private final Document doc;
    private final ContentExtractor contentExtractor;

    /**
     * Creates an extractor for the given document.
     *
     * @param doc the parsed page; used both for content extraction and for its base URI
     */
    public NewsExtractor(Document doc) {
        // The document must be stored: getNews() reads doc.baseUri(). Leaving the field
        // unassigned made every getNews() call fail with a NullPointerException.
        this.doc = doc;
        this.contentExtractor = new ContentExtractor(doc);
    }

    /**
     * Extracts the news content element, URL, time and title from the document.
     *
     * <p>Content extraction is mandatory: if it fails, the failure is logged and rethrown
     * wrapped in an {@link Exception}. Time and title extraction are best-effort: a
     * failure is logged and the corresponding field is simply not set.
     *
     * @return the populated {@link News} object
     * @throws Exception if the content element cannot be extracted
     */
    public News getNews() throws Exception {
        News news = new News();
        Element contentElement;
        try {
            contentElement = contentExtractor.getContentElement();
            news.setContentElement(contentElement);
        } catch (Exception ex) {
            LOG.info("news content extraction failed,extraction abort", ex);
            throw new Exception(ex);
        }

        if (doc.baseUri() != null) {
            news.setUrl(doc.baseUri());
        }

        try {
            news.setTime(contentExtractor.getTime(contentElement));
        } catch (Exception ex) {
            // This branch handles the time lookup; the previous message wrongly said "title".
            LOG.info("news time extraction failed", ex);
        }

        try {
            news.setTitle(contentExtractor.getTitle(contentElement));
        } catch (Exception ex) {
            LOG.info("title extraction failed", ex);
        }
        return news;
    }

    /**
     * Extracts structured news information from a Jsoup document.
     *
     * @param doc the parsed page
     * @return the extracted news
     * @throws Exception if the content element cannot be extracted
     */
    public static News getNewsByDoc(Document doc) throws Exception {
        NewsExtractor newsExtractor = new NewsExtractor(doc);
        return newsExtractor.getNews();
    }

    /**
     * Parses the given HTML and extracts structured news information from it.
     *
     * @param html the page source
     * @return the extracted news
     * @throws Exception if the content element cannot be extracted
     */
    public static News getNewsByHtml(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        return getNewsByDoc(doc);
    }

    /**
     * Parses the given HTML with the given URL as base URI and extracts structured news
     * information from it.
     *
     * @param html the page source
     * @param url  the base URI passed to the parser
     * @return the extracted news
     * @throws Exception if the content element cannot be extracted
     */
    public static News getNewsByHtml(String html, String url) throws Exception {
        Document doc = Jsoup.parse(html, url);
        return getNewsByDoc(doc);
    }

}
78 changes: 78 additions & 0 deletions src/main/java/cn/edu/hfut/dmic/contentextractor/TextAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package cn.edu.hfut.dmic.contentextractor;

/**
 * String-comparison helpers: a length-gated similarity ratio based on the longest common
 * subsequence, the longest-common-subsequence length itself, and the edit distance.
 */
public class TextAnalyzer {

    /**
     * Computes a similarity score for two strings.
     *
     * <p>The score is the longest-common-subsequence length divided by the length of the
     * longer string. It is 0 when either string is empty or when the longer string is at
     * least three times as long as the shorter one.
     *
     * @param a first string
     * @param b second string
     * @return the similarity score, or 0 for empty or too-differently-sized inputs
     */
    protected double strSim(String a, String b) {
        final int longer = Math.max(a.length(), b.length());
        final int shorter = Math.min(a.length(), b.length());
        if (shorter == 0) {
            return 0;
        }
        // Strings whose lengths differ by a factor of three or more are not compared.
        if ((double) longer / shorter >= 3) {
            return 0;
        }
        return (double) lcs(a, b) / longer;
    }

    /**
     * Returns the length of the longest common subsequence of two strings.
     *
     * @param x first string
     * @param y second string
     * @return the subsequence length; 0 if either string is empty
     */
    protected int lcs(String x, String y) {
        final int rows = x.length();
        final int cols = y.length();
        if (rows == 0 || cols == 0) {
            return 0;
        }

        // table[r][c] holds the LCS length of the first r chars of x and first c chars of y.
        final int[][] table = new int[rows + 1][cols + 1];
        for (int r = 1; r <= rows; r++) {
            final char fromX = x.charAt(r - 1);
            for (int c = 1; c <= cols; c++) {
                table[r][c] = (fromX == y.charAt(c - 1))
                        ? table[r - 1][c - 1] + 1
                        : Math.max(table[r - 1][c], table[r][c - 1]);
            }
        }
        return table[rows][cols];
    }

    /**
     * Returns the edit distance between two strings, counting single-character
     * replacements, insertions and deletions with cost 1 each.
     *
     * @param word1 first string
     * @param word2 second string
     * @return the minimum number of edits turning one string into the other
     */
    protected int editDistance(String word1, String word2) {
        final int n = word1.length();
        final int m = word2.length();
        final int[][] cost = new int[n + 1][m + 1];

        // Against an empty prefix, the distance is the length of the other prefix.
        for (int r = 1; r <= n; r++) {
            cost[r][0] = r;
        }
        for (int c = 1; c <= m; c++) {
            cost[0][c] = c;
        }

        for (int r = 1; r <= n; r++) {
            final char left = word1.charAt(r - 1);
            for (int c = 1; c <= m; c++) {
                if (left == word2.charAt(c - 1)) {
                    cost[r][c] = cost[r - 1][c - 1];
                } else {
                    final int cheapest = Math.min(
                            cost[r - 1][c - 1],
                            Math.min(cost[r - 1][c], cost[r][c - 1]));
                    cost[r][c] = cheapest + 1;
                }
            }
        }
        return cost[n][m];
    }
}
Loading