Skip to content

Commit

Permalink
Improve summary handling
Browse files Browse the repository at this point in the history
Improve cleaning of content in parentheses
Improve whitespace removal
Fix generation of follow up summary
  • Loading branch information
krisgesling committed Oct 20, 2021
1 parent cbdee66 commit 395b7f0
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 7 deletions.
2 changes: 1 addition & 1 deletion wiki/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@
DisambiguationError
)

from .pymediawiki import Wiki
from .pymediawiki import Wiki
15 changes: 9 additions & 6 deletions wiki/pymediawiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
)

from mycroft.util import LOG
from .util import remove_nested_parentheses


DEFAULT_IMAGE = 'ui/default-images/wikipedia-logo.svg'
Expand Down Expand Up @@ -146,11 +147,12 @@ def get_summary_intro(self, page: MediaWikiPage) -> tuple([str, int]):
"""
length = 20 if self.auto_more else 2
answer = self.summarize_page(page, sentences=length)
if not self.auto_more and len(answer) > 250:
answer = self.summarize_page(page, sentences=1)
if not self.auto_more and len(answer) > 300:
length = 1
answer = self.summarize_page(page, sentences=length)
return answer, length

def get_summary_next_lines(self, page: MediaWikiPage, previous_lines: int, num_lines: int = 5) -> tuple([str, int]):
def get_summary_next_lines(self, page: MediaWikiPage, previous_lines: int = 2, num_lines: int = 5) -> tuple([str, int]):
"""Get the next summary lines to be read.
Args:
Expand All @@ -162,7 +164,7 @@ def get_summary_next_lines(self, page: MediaWikiPage, previous_lines: int, num_l
total length of summary read so far ie previous_lines + num_lines
"""
total_summary_read = previous_lines + num_lines
previously_read = page.summarize(sentences=previous_lines)
previously_read = self.summarize_page(page, sentences=previous_lines)
next_summary_section = self.summarize_page(
page, sentences=total_summary_read).replace(previously_read, '')
return next_summary_section, total_summary_read
Expand Down Expand Up @@ -214,6 +216,7 @@ def summarize_page(self, page: MediaWikiPage, sentences: int) -> str:
sentences: number of sentences to return
"""
pymediawiki_summary = page.summarize(sentences=sentences)
cleaned_text = re.sub(
"\(.*?\)", "", pymediawiki_summary).replace(' ', ' ')
cleaned_text = remove_nested_parentheses(pymediawiki_summary)
# remove white spaces
cleaned_text = " ".join(cleaned_text.split())
return cleaned_text
30 changes: 30 additions & 0 deletions wiki/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2021, Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

def remove_nested_parentheses(input: str) -> str:
    """Remove content contained within parentheses from a string.

    Everything between an opening parenthesis and its matching closing
    parenthesis is dropped, together with the parentheses themselves.
    This includes content that is nested within multiple sets, eg:
        Lemurs (/ˈliːmər/ (listen) LEE-mər)

    Whitespace surrounding a removed group is left untouched, so the
    result may contain doubled spaces.

    Unbalanced input is handled as follows:
      * a ')' with no preceding unmatched '(' is kept as ordinary text;
      * a '(' that is never closed causes the rest of the string to be
        dropped.

    Args:
        input: the text to clean.

    Returns:
        The text with all parenthesised content removed.
    """
    # Collect kept characters in a list and join once at the end; repeated
    # ``str +=`` in a loop can degrade to quadratic time.
    kept = []
    nest_depth = 0
    for char in input:
        if char == '(':
            nest_depth += 1
        elif char == ')' and nest_depth:
            # Only close a group that was actually opened.
            nest_depth -= 1
        elif not nest_depth:
            # Outside every parenthesised group: keep the character
            # (this also keeps a stray ')' seen at depth zero).
            kept.append(char)
    return ''.join(kept)

0 comments on commit 395b7f0

Please sign in to comment.