Skip to content

Commit 5ec9486

Browse files
authored
perf: Enhance Word parsing (#2612)
1 parent 263c18e commit 5ec9486

File tree

2 files changed

+41
-10
lines changed

2 files changed

+41
-10
lines changed

apps/common/handle/impl/doc_split_handle.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,24 +110,51 @@ def get_image_id(image_id):
110110
return get_image_id
111111

112112

113+
# Font-size bands, in points, used to infer a heading level for paragraphs
# that carry no heading style. Each entry is [inclusive lower bound,
# exclusive upper bound]; an entry's position + 1 is the heading level.
title_font_list = [
    [36, 100],
    [26, 36],
    [24, 26],
    [22, 24],
    [18, 22],
    [16, 18]
]


def get_title_level(paragraph: Paragraph):
    """Return the heading level of *paragraph*, or ``None`` if it is not a heading.

    Detection is attempted in two steps:

    1. If the paragraph's style name starts with ``Heading``, ``TOC 标题`` or
       ``标题``, the prefix is stripped and the remainder is parsed as the
       integer level. A remainder that is not an integer yields ``None``.
    2. Otherwise, if the paragraph consists of exactly one run with an
       explicit font size, that size (in points) is looked up in
       ``title_font_list`` and the matching band's 1-based position is the
       level. Sizes outside every band yield ``None``.

    :param paragraph: object exposing ``style`` (with ``name``) and ``runs``
        (each with ``font.size.pt``).
    :return: heading level as ``int``, or ``None``.
    """
    try:
        style = paragraph.style
        if style is not None:
            psn = style.name
            if psn.startswith(('Heading', 'TOC 标题', '标题')):
                return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', ''))

        runs = paragraph.runs
        if len(runs) != 1:
            return None

        font_size = runs[0].font.size
        # A run without an explicit size reports None; check it directly
        # instead of relying on an AttributeError being raised and swallowed.
        if font_size is None:
            return None

        pt = font_size.pt
        for level, (lower, upper) in enumerate(title_font_list, start=1):
            if lower <= pt < upper:
                return level
    except Exception:
        # Heading detection is best-effort: an unparsable style name or an
        # unexpected paragraph shape means "not a heading", never a failure.
        return None
    return None
140+
141+
113142
class DocSplitHandle(BaseSplitHandle):
114143
@staticmethod
115144
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
116145
try:
117-
psn = paragraph.style.name
118-
if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
119-
title = "".join(["#" for i in range(
120-
int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
121-
'')))]) + " " + paragraph.text
146+
title_level = get_title_level(paragraph)
147+
if title_level is not None:
148+
title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text
122149
images = reduce(lambda x, y: [*x, *y],
123150
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
124151
paragraph._element],
125152
[])
126-
127153
if len(images) > 0:
128154
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
129155
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
130156
return title
157+
131158
except Exception as e:
132159
traceback.print_exc()
133160
return paragraph.text

apps/common/util/split_model.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -339,13 +339,14 @@ def parse(self, text: str):
339339
for e in result:
340340
if len(e['content']) > 4096:
341341
pass
342-
return [item for item in [self.post_reset_paragraph(row) for row in result] if
342+
title_list = list(set([row.get('title') for row in result]))
343+
return [item for item in [self.post_reset_paragraph(row, title_list) for row in result] if
343344
'content' in item and len(item.get('content').strip()) > 0]
344345

345-
def post_reset_paragraph(self, paragraph: Dict):
346+
def post_reset_paragraph(self, paragraph: Dict, title_list: List[str]):
346347
result = self.filter_title_special_characters(paragraph)
347348
result = self.sub_title(result)
348-
result = self.content_is_null(result)
349+
result = self.content_is_null(result, title_list)
349350
return result
350351

351352
@staticmethod
@@ -357,11 +358,14 @@ def sub_title(paragraph: Dict):
357358
return paragraph
358359

359360
@staticmethod
360-
def content_is_null(paragraph: Dict):
361+
def content_is_null(paragraph: Dict, title_list: List[str]):
    """Normalise a paragraph that has a title but no content.

    A paragraph without a ``'title'`` key, with an empty/``None`` title, or
    with non-blank content is returned unchanged. Otherwise:

    * if the title is contained in a different, longer entry of
      ``title_list``, the paragraph is emptied (both fields become ``''``);
    * if not, the title text is moved into ``'content'`` and the title is
      cleared.

    :param paragraph: dict that may hold ``'title'`` and ``'content'``.
    :param title_list: titles collected from all paragraphs; entries that
        are not strings (e.g. ``None`` for rows lacking a title) are ignored.
    :return: the original dict, or a new ``{'title', 'content'}`` dict.
    """
    if 'title' not in paragraph:
        return paragraph

    title = paragraph.get('title')
    content = paragraph.get('content')
    content_is_blank = content is None or len(content.strip()) == 0
    if not content_is_blank or not title:
        return paragraph

    # The list may contain None when a row has no title; calling
    # __contains__ on it would raise, so only string entries are compared.
    contained_in_other_title = any(
        isinstance(other, str) and other != title and title in other
        for other in title_list
    )
    if contained_in_other_title:
        return {'title': '', 'content': ''}
    return {'title': '', 'content': title}
367371

0 commit comments

Comments
 (0)