# preProcess.pl
# Forked from lushl9301/PubMed-Text-Mining-Tool.
use warnings;
use Lingua::EN::Sentence qw(get_sentences);
use Unicode::Normalize 'normalize';
require "splitFunction.pl";
# ---------------------------------------------------------------------------
# Convert a PubMed export in MEDLINE tagged format into plain text.
#
# Input : pubmed_result.txt, read as UTF-8.  Records are separated by blank
#         lines; the first line of each record is kept verbatim as its PMID
#         line (trailing newline included).
# Output: for every record that is kept, STDOUT receives the PMID line, a
#         "Title- ..." line, and the text returned by _split() for the
#         abstract, followed by a newline.
#
# A record is skipped when its title starts with "[" (non-English, per the
# original author's note), when it has no usable abstract, or when _split()
# reports a count of 0.
#
# _split() comes from splitFunction.pl, which is required above.  It is
# called with the "Abstract- ..." string and must return a (count, text)
# pair; presumably the count is the number of sentences -- confirm against
# splitFunction.pl.
# ---------------------------------------------------------------------------
use strict;

binmode(STDOUT, ":utf8");

my $filename = "pubmed_result.txt";
open my $in, "<:encoding(utf-8)", $filename
    or die "can't find file " . $filename . ": $!";

# Read one tagged field that may span several lines.
#   $fh         - handle to read continuation lines from
#   $first_line - the line carrying the "TAG  - " prefix
#   $tag        - the tag name, e.g. "TI" or "AB"
# Returns (text, next_line): the field text with continuation lines joined by
# single spaces, and the first line that does NOT belong to the field (undef
# at end of file).  The caller must process next_line itself; returning it
# instead of dropping it is what keeps the line after a field from being lost.
my $read_field = sub {
    my ($fh, $first_line, $tag) = @_;

    (my $text = $first_line) =~ s/^\Q$tag\E\s+-//;    # remove "TAG  -"
    $text =~ s/^\s+|\s+$//g;                          # remove leading/trailing spaces

    my $next_line;
    while (defined($next_line = <$fh>)) {
        # A continuation line is indented AND has content.  A blank line is
        # the record separator and must not be swallowed as a continuation.
        last if $next_line !~ /^\s+\S/;
        $next_line =~ s/^\s+|\s+$//g;
        $text .= " " . $next_line;
    }
    return ($text, $next_line);
};

# $line always holds the next line that has not been processed yet (one-line
# lookahead); it is undef once the input is exhausted.
my $line = <$in>;

RECORD:
while (defined $line) {
    # Skip separator lines between records.
    if ($line =~ /^\s*$/) {
        $line = <$in>;
        next RECORD;
    }

    # The first line of a record is its PMID line.
    my $pmid     = $line;
    my $title    = "Title- ";
    my $abstract = "Abstract- ";

    $line = <$in>;

    FIELD:
    while (defined $line) {
        last FIELD if $line =~ /^\s*$/;    # blank line: record finished

        # Tags are anchored at the start of the line so that BTI/CTI are not
        # mistaken for TI, nor OAB for AB.
        if ($line =~ /^TI\s+-/) {
            my $text;
            ($text, $line) = $read_field->($in, $line, "TI");
            $title .= $text;
            next FIELD;                    # $line is already the lookahead
        }
        if ($line =~ /^AB\s+-/) {
            my $text;
            ($text, $line) = $read_field->($in, $line, "AB");
            $abstract .= $text;
            next FIELD;
        }

        $line = <$in>;                     # some other field: ignore it
    }

    # If not English or no abstract, ignore.  "Abstract- " is 10 characters
    # long, so a total length of 7..12 means at most two characters of
    # abstract text.
    if ($title =~ /Title- \[/ or $abstract =~ /^.{7,12}$/) {
        next RECORD;
    }

    my ($counter, $abstractSentences) = _split($abstract);
    if ($counter > 0) {
        print $pmid . "$title\n" . $abstractSentences . "\n";
    }
}

close $in;