A Unix-like command-line utility for filtering raw sentences by their lengths after SentencePiece encoding.
git clone git@github.com:erip/spm_filter.git
cd spm_filter
pip install -e .
# Read from stdin by default
cat sents.txt | spm-filter -m /path/to/sentencepiece.model --max-len 256 > filtered.txt
# Or read from a file
spm-filter -i sents.txt -m /path/to/sentencepiece.model --max-len 256 > filtered2.txt