Merge pull request dmmiller612#42 from ColdTeapot273K/master

Coreference resolution and other fixes
Bookscribs-io · Mar 30, 2020 · ff98f95 · ff98f95
2 parents 00b6435 + 3301f09
commit ff98f95
Show file tree

Hide file tree

Showing 6 changed files with 75 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ This repo is the generalization of the lecture-summarizer repo. This tool utiliz
 to run extractive summarizations. This works by first embedding the sentences, then running a clustering algorithm, finding 
 the sentences that are closest to the cluster's centroids. This library also uses coreference techniques, utilizing the 
 https://github.com/huggingface/neuralcoref library to resolve words in summaries that need more context. The greedyness of 
-the neuralcoref library can be tweaked in the SingleModel class.
+the neuralcoref library can be tweaked in the CoreferenceHandler class.
 
 Paper: https://arxiv.org/abs/1906.04165
 
@@ -18,10 +18,19 @@ Paper: https://arxiv.org/abs/1906.04165
 pip install bert-extractive-summarizer
 ```
 
-#### NOTE: If you are using coreference, you will need spacy 2.1.3 installed. There is currently an issue with Spacy 2.1.4 that produces segmentation faults. 
+#### We use spaCy 2.1.3 by default to support neuralcoref 4.0 (there is currently an issue with spaCy 2.1.4 that produces segmentation faults). If you want to use the latest spaCy, you'll have to either build neuralcoref 4.0 from source ([details](https://github.com/huggingface/neuralcoref/issues/197)) or not use coreference resolution at all.
 ```bash
-pip install spacy
-pip install transformers==2.2.0
+pip install spacy==2.1.3
+pip install transformers==2.2.2
+pip install neuralcoref
+```
+
+#### Coreference functionality with neuralcoref requires a spaCy model, which has to be downloaded separately. 
+The default model is the small English spaCy model (en_core_web_sm, 11 MB), which is installed automatically with this package. To use another model, you'll have to install it manually.
+
+Example: installing the medium (91 MB) English model (for more models, see the [spaCy documentation](https://spacy.io/usage/models)).
+```bash
+python -m spacy download en_core_web_md
 ```
 
 ## How to Use
@@ -37,6 +46,42 @@ model(body)
 model(body2)
 ```
 
+#### Simple Example with coreference
+```python
+from summarizer import Summarizer
+from summarizer.coreference_handler import CoreferenceHandler
+
+handler = CoreferenceHandler(greedyness=.4)
+# How coreference works:
+# >>>handler.process('''My sister has a dog. She loves him.''', min_length=2)
+# ['My sister has a dog.', 'My sister loves a dog.']
+
+body = 'Text body that you want to summarize with BERT'
+body2 = 'Something else you want to summarize with BERT'
+model = Summarizer(sentence_handler=handler)
+model(body)
+model(body2)
+```
+
+#### Simple Example with custom model (we always have to set output_hidden_states=True in the model config)
+```python
+from transformers import *
+
+# Load model, model config and tokenizer via Transformers
+custom_config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased')
+custom_config.output_hidden_states=True
+custom_tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+custom_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', config=custom_config)
+
+from summarizer import Summarizer
+
+body = 'Text body that you want to summarize with BERT'
+body2 = 'Something else you want to summarize with BERT'
+model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+model(body)
+model(body2)
+```
+
 #### Large Example
 
 ```python
@@ -89,6 +134,7 @@ model = Summarizer(
     custom_tokenizer: Custom tokenizer can be supplied here,
     reduce_option: str # It can be 'mean', 'median', or 'max'. This reduces the embedding layer for pooling.
     greedyness: float # number between 0 and 1. It is used for the coreference model. Anywhere from 0.35 to 0.45 seems to work well.
+    sentence_handler: The handler to process sentences. If you want to use coreference, instantiate and pass a CoreferenceHandler instance.
 )
 
 model(

diff --git a/requirements-service.txt b/requirements-service.txt
@@ -10,4 +10,5 @@ scikit-learn
 bert-extractive-summarizer==0.2.2
 Flask
 flask-cors
-nltk
+nltk
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,11 @@
 numpy==1.16.3
 torch==1.0.1
-spacy
+spacy==2.1.3
 transformers==2.2.2
 Cython==0.29.10
 tqdm==4.32.2
 neuralcoref==4.0
 argparse
 scikit-learn
-pytest
+pytest
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
diff --git a/summarizer/coreference_handler.py b/summarizer/coreference_handler.py
@@ -1,12 +1,16 @@
-from spacy.lang.en import English
+# Removed the previous import and related functionality, since it only provided a blank language model,
+#  while neuralcoref requires a pretrained language model loaded via spacy.load()
+
 import neuralcoref
 from summarizer.sentence_handler import SentenceHandler
 
+import spacy
+
 
 class CoreferenceHandler(SentenceHandler):
 
-    def __init__(self, language = English, greedyness: float = 0.45):
-        super().__init__(language)
+    def __init__(self, spacy_model: str = 'en_core_web_sm', greedyness: float = 0.45):
+        self.nlp = spacy.load(spacy_model)
         neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)
 
     def process(self, body: str, min_length: int = 40, max_length: int = 600):

diff --git a/summarizer/model_processors.py b/summarizer/model_processors.py
@@ -27,7 +27,7 @@ def __init__(
         :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
         :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
         :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
-        :param sentence_handler: The handler to process sentences.
+        :param sentence_handler: The handler to process sentences. If you want to use coreference, instantiate and pass a CoreferenceHandler instance.
         :param random_state: The random state to reproduce summarizations.
         """
 

diff --git a/tests/test_summary_items.py b/tests/test_summary_items.py
@@ -1,5 +1,6 @@
 import pytest
 from summarizer import Summarizer, TransformerSummarizer
+from summarizer.coreference_handler import CoreferenceHandler
 from transformers import AlbertTokenizer, AlbertModel
 
 
@@ -19,6 +20,10 @@ def albert_transformer():
 def summarizer():
     return Summarizer('distilbert-base-uncased')
 
+@pytest.fixture()
+def coreference_handler():
+    return CoreferenceHandler()
+
 
 @pytest.fixture()
 def passage():
@@ -74,4 +79,10 @@ def test_albert(custom_summarizer, passage):
 
 def test_transformer_clz(albert_transformer, passage):
     res = albert_transformer(passage)
-    assert len(res) > 10
+    assert len(res) > 10
+
+def test_coreference_handler(coreference_handler):
+    orig = '''My sister has a dog. She loves him.'''
+    resolved = '''My sister has a dog. My sister loves a dog.'''
+    result = coreference_handler.process(orig, min_length=2)
+    assert ' '.join(result) == resolved