Skip to content

Commit

Permalink
README and final results
Browse files Browse the repository at this point in the history
  • Loading branch information
fgh95 committed Jan 20, 2021
1 parent c87dce4 commit b692b12
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 38 deletions.
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@



This repository contains custom pipes and models to classify scientific publications from PubMed depending on whether they estimate pharmacokinetic (PK) parameters from _in vivo_ studies. The final pipeline retrieved more than 120K PK publications and runs weekly updates. All the retrieved data is at https://app.pkpdai.com/
This repository contains custom pipes and models to classify scientific publications from PubMed depending on whether they estimate pharmacokinetic (PK) parameters from _in vivo_ studies. The final pipeline retrieved more than 120K PK publications and runs weekly updates; all the retrieved data is available at https://app.pkpdai.com/.

# Reproduce our results

Expand Down Expand Up @@ -167,9 +167,18 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
````
python scripts/display_results.py \
--input-dir data/results/distributional\
--output-dir data/final/distributional
--input-dir data/results/distributional \
--output-dir data/final/distributional \
--convert-latex
````
````
python scripts/display_results.py \
--input-dir data/results/distributional/bow_and_distributional \
--output-dir data/final/distributional/bow_and_distributional \
--convert-latex
````
From these plots we can see that the best-performing architecture on the training data, on average, is the one using average embeddings from BioBERT and unigram features.
Expand All @@ -195,7 +204,7 @@ Train the final pipeline (preprocessing, encoding, decoding) from scratch with o
--test-labels data/labels/test_data.csv \
--cv-dir data/results/final-pipeline \
--output-dir data/results/final-pipeline \
--train-pipeline False
--train-pipeline True
````
# Make new predictions
Expand Down
9 changes: 3 additions & 6 deletions data/final/distributional/all_results.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
,Pipeline,Precision,Recall,F1-score,IQV,F1
0,res_biobert_avg_bow,"83.67(76.73,89.06)","80.38(74.05,87.34)","81.73(77.84,86.0)",8.159999999999997,81.73
0,res_biobert_all_bow,"83.77(75.55,88.81)","79.11(73.42,85.44)","81.04(77.17,85.45)",8.280000000000001,81.04
0,res_unigrams,"80.06(73.91,86.0)","82.28(74.05,88.61)","80.62(75.77,85.19)",9.420000000000002,80.62
0,res_biobert_all,"80.13(71.75,85.95)","75.95(69.62,82.93)","77.69(72.67,81.44)",8.769999999999996,77.69
0,res_biobert_avg,"78.14(68.98,85.43)","75.32(68.34,82.91)","76.63(71.63,81.26)",9.63000000000001,76.63
0,res_specter,"74.11(66.46,80.92)","68.99(62.01,76.6)","71.21(66.21,75.85)",9.64,71.21
0,res_biobert_all,"80.1 (71.8,86.0)","75.9 (69.6,82.9)","77.7 (72.7,81.4)",8.700000000000003,77.7
0,res_biobert_avg,"78.1 (69.0,85.4)","75.3 (68.3,82.9)","76.6 (71.6,81.3)",9.700000000000003,76.6
0,res_specter,"74.1 (66.5,80.9)","69.0 (62.0,76.6)","71.2 (66.2,75.8)",9.599999999999994,71.2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,Pipeline,Precision,Recall,F1-score,IQV,F1
0,res_biobert_avg_bow,"83.7 (76.7,89.1)","80.4 (74.1,87.3)","81.7 (77.8,86.0)",8.200000000000003,81.7
0,res_biobert_all_bow,"83.8 (75.6,88.8)","79.1 (73.4,85.4)","81.0 (77.2,85.4)",8.200000000000003,81.0
0,res_unigrams,"80.1 (73.9,86.0)","82.3 (74.1,88.6)","80.6 (75.8,85.2)",9.400000000000006,80.6
14 changes: 3 additions & 11 deletions data/final/fields/all_results.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
,Pipeline,Precision,Recall,F1-score,IQV,F1
0,res_optimal,"80.06 (73.91,86.0)","82.28 (74.05,88.61)","80.62 (75.77,85.19)",9.420000000000002,80.62
0,res_all,"80.12 (72.96,86.13)","81.65 (74.05,87.36)","80.5 (75.71,84.85)",9.14,80.5
0,res_pub_type,"78.05 (71.57,84.32)","81.65 (74.68,87.36)","79.62 (75.54,84.21)",8.669999999999987,79.62
0,res_mesh,"79.22 (72.14,85.15)","79.75 (72.78,86.08)","79.5 (74.34,83.34)",9.0,79.5
0,res_affiliations,"76.65 (69.76,82.07)","80.38 (72.78,86.71)","78.33 (73.25,81.94)",8.689999999999998,78.33
0,res_abstract,"76.95 (69.83,82.55)","79.75 (73.39,86.08)","78.25 (73.62,82.61)",8.989999999999995,78.25
0,res_keywords,"76.6 (70.21,83.02)","80.38 (72.77,85.46)","78.22 (73.81,82.24)",8.429999999999993,78.22
0,res_authors,"76.42 (69.39,82.58)","80.38 (72.78,86.08)","78.19 (73.32,82.64)",9.320000000000007,78.19
0,res_journal,"76.42 (70.22,82.25)","79.75 (72.77,85.44)","77.99 (73.58,81.99)",8.409999999999997,77.99
0,res_chemical,"76.01 (69.51,81.88)","80.38 (73.42,86.08)","77.81 (72.98,81.97)",8.989999999999995,77.81
0,res_title,"65.31 (59.01,72.73)","65.82 (55.03,72.78)","65.03 (59.49,70.95)",11.46,65.03
0,res_unigrams,"80.1 (73.9,86.0)","82.3 (74.1,88.6)","80.6 (75.8,85.2)",9.400000000000006,80.6
0,res_bigrams,"79.9 (72.2,86.9)","81.6 (74.1,88.0)","80.6 (76.2,84.8)",8.599999999999994,80.6
0,res_trigrams,"80.4 (74.4,86.3)","81.0 (73.4,88.0)","80.6 (76.7,84.6)",7.8999999999999915,80.6
6 changes: 3 additions & 3 deletions data/final/ngrams/all_results.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
,Pipeline,Precision,Recall,F1-score,IQV,F1
0,res_unigrams,"80.06(73.91,86.0)","82.28(74.05,88.61)","80.62(75.77,85.19)",9.420000000000002,80.62
0,res_trigrams,"80.36(74.4,86.3)","81.01(73.42,87.97)","80.61(76.68,84.55)",7.86999999999999,80.61
0,res_bigrams,"79.88(72.25,86.9)","81.65(74.05,87.99)","80.6(76.22,84.76)",8.540000000000006,80.6
0,res_unigrams,"80.1 (73.9,86.0)","82.3 (74.1,88.6)","80.6 (75.8,85.2)",9.400000000000006,80.6
0,res_bigrams,"79.9 (72.2,86.9)","81.6 (74.1,88.0)","80.6 (76.2,84.8)",8.599999999999994,80.6
0,res_trigrams,"80.4 (74.4,86.3)","81.0 (73.4,88.0)","80.6 (76.7,84.6)",7.8999999999999915,80.6
38 changes: 25 additions & 13 deletions pk_classifier/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ def plot_df(df_results, out_path=None):
plt.close()


def make_results_table2(df_results, pipeline_name, round_dec=1):
    """Summarise bootstrap metric samples as "median (2.5%,97.5%)" strings.

    Parameters
    ----------
    df_results : pd.DataFrame
        One row per bootstrap iteration, with "Precision", "Recall" and
        "F1-score" columns on a 0-1 scale.
    pipeline_name : str
        Identifier written to the "Pipeline" column of the output row.
    round_dec : int, optional
        Decimals shown for the reported percentages (default 1).

    Returns
    -------
    pd.DataFrame
        Single-row frame with columns
        ['Pipeline', 'Precision', 'Recall', 'F1-score', 'IQV'], each metric
        formatted as "median (lower,upper)"; IQV is the width of the
        F1-score 95% interval.
    """
    df_results = df_results[["Precision", "Recall", "F1-score"]]
    # After transpose: rows = metrics (P, R, F1); cols = 2.5%, 50%, 97.5%.
    percentiles = np.transpose(np.quantile(df_results, q=[0.025, 0.5, 0.975], axis=0))
    percentiles = np.round(percentiles, 4) * 100  # 0-1 scores -> percentages

    def _fmt(row):
        # "median (lower,upper)" with round_dec decimals.
        return "{0} ({1},{2})".format(str(np.round(row[1], round_dec)),
                                      str(np.round(row[0], round_dec)),
                                      str(np.round(row[2], round_dec)))

    precision = _fmt(percentiles[0])
    recall = _fmt(percentiles[1])
    f1 = _fmt(percentiles[2])

    # Width of the F1 95% interval. Round the DIFFERENCE, not the operands:
    # subtracting two rounded floats leaves binary residue (e.g.
    # 8.700000000000003), which previously leaked into the CSV output.
    iqv = np.round(percentiles[2][2] - percentiles[2][0], round_dec)
    data = [[pipeline_name, precision, recall, f1, iqv]]
    out_dataframe = pd.DataFrame(data, columns=['Pipeline', 'Precision', 'Recall', 'F1-score', 'IQV'])
    return out_dataframe
Expand All @@ -45,8 +45,10 @@ def rename(inp_word):
res_abstract='Abstract', res_chemical='Chemicals', res_mesh='MeSH', res_pub_type='Pub. Type',
res_affiliations='Affiliations', res_all='All fields', res_optimal='Opt. Fields',
res_unigrams='Unigrams', res_bigrams='Bigrams', res_trigrams='Trigrams',
res_specter_alone='SPECTER', res_biobert_bow_mean='BioBERT mean pooling',
res_biobert_meanmaxmin='BioBERT mean \n + min + max pooling')
res_specter='SPECTER', res_biobert_avg='BioBERT\nmean pooling',
res_biobert_all='BioBERT mean\n+ min&max pooling', res_biobert_all_bow='Unigrams\n+ BioBERT mean\n+ '
'min&max pooling',
res_biobert_avg_bow='Unigrams\n+ BioBERT\nmean pooling')
if inp_word in mapper.keys():
return mapper[inp_word]
else:
Expand All @@ -57,6 +59,12 @@ def get_all_results(inp_result_files, input_dir, output_dir, convert_latex):
all_results = []
all_for_boxplot = []
all_for_boxplot_names = []

if 'res_unigrams.csv' in inp_result_files:
inp_result_files = inp_result_files[-1:] + inp_result_files[:-1]

print(inp_result_files)

for inp_result in inp_result_files:
instance_df = pd.read_csv(os.path.join(input_dir, inp_result))
all_for_boxplot.append(instance_df["F1-score"].values)
Expand All @@ -71,18 +79,22 @@ def get_all_results(inp_result_files, input_dir, output_dir, convert_latex):
print(all_results_ready)
if convert_latex:
print(all_results_ready.to_latex(index=False))

all_results_ready.to_csv(os.path.join(output_dir, "all_results.csv"))

idx_medians_sorted = np.asarray([np.median(x) for x in all_for_boxplot]).argsort()
# idx_medians_sorted = range(0,len(idx_medians_sorted))
all_for_boxplot = [all_for_boxplot[i] * 100 for i in idx_medians_sorted]
all_for_boxplot_names = [rename(all_for_boxplot_names[i]) for i in idx_medians_sorted]
fig7, ax7 = plt.subplots()
fig7.set_figheight(25)
fig7.set_figheight(40)
fig7.set_figwidth(10)
plt.ylim(55, 90)

plt.ylim(70, 90)
ax7.boxplot(all_for_boxplot, labels=all_for_boxplot_names, whis=[2.5, 97.5])
plt.ylabel('F1-score (%)', fontsize=16)
plt.xticks(rotation=65, fontsize=12)
plt.yticks(fontsize=12)
plt.gcf().subplots_adjust(bottom=0.2)
plt.show()
plt.close()
2 changes: 1 addition & 1 deletion scripts/train_test_final.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def main():
parser.add_argument("--train-pipeline", type=str2bool, nargs='?',
help="Determine whether the input to process is from SPECTER",
const=True,
default=False)
default=True)

args = parser.parse_args()
run(path_train=args.path_train, train_labels=args.train_labels, path_test=args.path_test,
Expand Down

0 comments on commit b692b12

Please sign in to comment.