Skip to content

Commit

Permalink
Merge branch 'master' of github.com:madhugopinathan/madhugopinathan.g…
Browse files Browse the repository at this point in the history
…ithub.io
  • Loading branch information
Madhu Gopinathan committed Jan 26, 2021
2 parents 3349c83 + b49dae8 commit 17d3a57
Showing 1 changed file with 80 additions and 111 deletions.
191 changes: 80 additions & 111 deletions notebooks/deep-nlu/introduction_to_nlp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@
},
"source": [
"!wget https://s3.amazonaws.com/fast-ai-nlp/yelp_review_full_csv.tgz\n",
"!tar xvfz yelp_review_full_csv.tgz"
"!tar xvfz yelp_review_full_csv.tgz\n",
"#"
],
"execution_count": 1,
"outputs": [
Expand Down Expand Up @@ -102,7 +103,8 @@
"import seaborn as sns\n",
"import spacy\n",
"nlp = spacy.load('en')\n",
"pd.options.display.max_colwidth=-1"
"pd.options.display.max_colwidth=-1\n",
"#"
],
"execution_count": 2,
"outputs": [
Expand Down Expand Up @@ -138,7 +140,8 @@
"source": [
"DATA_DIR = \"./yelp_review_full_csv/\"\n",
"df = pd.read_csv(DATA_DIR + \"train.csv\", header=None, names=['rating', 'review'])\n",
"df[df.review.str.contains(\"indian\")].sample(3)"
"df[df.review.str.contains(\"indian\")].sample(3)\n",
"#"
],
"execution_count": 3,
"outputs": [
Expand Down Expand Up @@ -232,7 +235,8 @@
" are pretty bad as well.\"\"\"\n",
"doc = nlp(text)\n",
"for ent in doc.ents:\n",
" print(ent.label_, ent.lower_, ent.start, ent.end)"
" print(ent.label_, ent.lower_, ent.start, ent.end)\n",
"#"
],
"execution_count": 4,
"outputs": [
Expand Down Expand Up @@ -305,17 +309,8 @@
" return cuisine2idx.setdefault(cuisine, len(cuisine2idx))\n",
"\n",
"def get_token_idx(token):\n",
" return token2idx.setdefault(token, len(token2idx))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9faL7yUErChr"
},
"source": [
" return token2idx.setdefault(token, len(token2idx))\n",
"\n",
"def build_cuisine_token_mat(df,ws=10):\n",
" row = []\n",
" col = []\n",
Expand All @@ -339,36 +334,37 @@
" do_count_tokens(ent, doc[ent.end:ent.end+ws])\n",
" \n",
" return sparse.csr_matrix((data, (row, col)), \n",
" shape=(len(cuisine2idx), len(token2idx)))"
" shape=(len(cuisine2idx), len(token2idx)))\n",
"#"
],
"execution_count": null,
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8ycnOT_PrQM_",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
"base_uri": "https://localhost:8080/"
},
"outputId": "361bc29f-d9f6-4dac-924a-563c54102eca"
"outputId": "6ae30195-526f-4525-d7f6-c9b0e85c5936"
},
"source": [
"%%time\n",
"np.random.seed(42)\n",
"cuisine2idx = {}\n",
"token2idx = {}\n",
"small_df = df.sample(20000)\n",
"cuisine_token_mat = build_cuisine_token_mat(small_df)"
"cuisine_token_mat = build_cuisine_token_mat(small_df)\n",
"#"
],
"execution_count": null,
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 6min, sys: 6.96 s, total: 6min 7s\n",
"Wall time: 6min 8s\n"
"CPU times: user 4min 47s, sys: 7.14 s, total: 4min 55s\n",
"Wall time: 4min 55s\n"
],
"name": "stdout"
}
Expand All @@ -377,67 +373,49 @@
{
"cell_type": "code",
"metadata": {
"id": "dyPp6_JErXUy"
"id": "dyPp6_JErXUy",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "dcb69430-2d09-4958-88cc-6fc8ec4ccbc1"
},
"source": [
"idx2token = {v:k for k,v in token2idx.items()}\n",
"idx2cuisine = {v:k for k,v in cuisine2idx.items()}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "k6-r1FR7rX3q"
},
"source": [
"idx2cuisine = {v:k for k,v in cuisine2idx.items()}\n",
"\n",
"def get_top_cuisine_indices(mat,n):\n",
" rowsum = np.squeeze(np.asarray(mat.sum(axis=1)))\n",
" return np.argsort(rowsum)[::-1][:n]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "SeEp8Df3urrL",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 357
},
"outputId": "edc2a08f-fcdb-44a7-fbb6-7927ad23f702"
},
"source": [
" return np.argsort(rowsum)[::-1][:n]\n",
"\n",
"for idx in get_top_cuisine_indices(cuisine_token_mat,20):\n",
" print(idx,idx2cuisine[idx])"
" print(idx,idx2cuisine[idx])\n",
"#"
],
"execution_count": null,
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"3 mexican\n",
"15 chinese\n",
"13 italian\n",
"4 mexican\n",
"17 french\n",
"14 italian\n",
"16 chinese\n",
"2 thai\n",
"19 french\n",
"5 asian\n",
"12 american\n",
"24 japanese\n",
"10 indian\n",
"6 asian\n",
"9 \\nthe\n",
"30 japanese\n",
"13 american\n",
"12 indian\n",
"0 greek\n",
"56 korean\n",
"75 irish\n",
"4 vietnamese\n",
"14 venetian\n",
"20 hawaiian\n",
"27 cheese\n",
"78 spanish\n",
"111 german\n",
"63 brazilian\n",
"22 european\n"
"103 irish\n",
"5 vietnamese\n",
"83 korean\n",
"67 hawaiian\n",
"15 venetian\n",
"106 spanish\n",
"34 appetizer\n",
"64 polish\n",
"3 vegetarian\n"
],
"name": "stdout"
}
Expand All @@ -446,55 +424,46 @@
{
"cell_type": "code",
"metadata": {
"id": "0KBfEClSuu_G"
},
"source": [
"def get_top_token_indices(mat,n):\n",
" rowsum = np.squeeze(np.asarray(mat.sum(axis=0)))\n",
" return np.argsort(rowsum)[::-1][:n]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gBh2xQrMu4L9",
"id": "0KBfEClSuu_G",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 357
"base_uri": "https://localhost:8080/"
},
"outputId": "01af5bb6-82ad-464f-fd65-b8acd8dc207e"
"outputId": "63b1bdcb-6f65-4eea-fc6a-c51a1c62aca5"
},
"source": [
"def get_top_token_indices(mat,n):\n",
" rowsum = np.squeeze(np.asarray(mat.sum(axis=0)))\n",
" return np.argsort(rowsum)[::-1][:n]\n",
" \n",
"for idx in get_top_token_indices(cuisine_token_mat,20):\n",
" print(idx,idx2token[idx])"
" print(idx,idx2token[idx])\n",
"#"
],
"execution_count": null,
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"9 \n",
"16 food\n",
"8 good\n",
"6 place\n",
"61 like\n",
"65 restaurant\n",
"24 great\n",
"207 ordered\n",
"78 restaurants\n",
"335 better\n",
"63 menu\n",
"157 service\n",
"245 time\n",
"54 try\n",
"548 salad\n",
"466 best\n",
"83 nice\n",
"59 authentic\n",
"746 chicken\n",
"51 little\n"
"4 \n",
"23 food\n",
"5 good\n",
"73 like\n",
"13 place\n",
"76 restaurant\n",
"37 great\n",
"105 ordered\n",
"419 better\n",
"22 salad\n",
"86 restaurants\n",
"255 service\n",
"251 chicken\n",
"560 best\n",
"29 menu\n",
"322 time\n",
"66 try\n",
"181 toast\n",
"352 fries\n",
"91 nice\n"
],
"name": "stdout"
}
Expand Down

0 comments on commit 17d3a57

Please sign in to comment.