Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
ShaharBenIshay authored Jan 15, 2023
1 parent 6db0552 commit fe8cccf
Show file tree
Hide file tree
Showing 3 changed files with 857 additions and 0 deletions.
165 changes: 165 additions & 0 deletions BM_25_from_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import math
import numpy as np
import pandas as pd


def get_top_n(sim_dict, N=40):
    """
    Return the N highest-scoring documents, ranked by descending score.

    Parameters
    ----------
    sim_dict : dict
        Maps doc_id -> ranking score (e.g., BM25 or cosine similarity).
    N : int, default 40
        Maximum number of (doc_id, score) pairs to return.

    Returns
    -------
    list of (doc_id, score) tuples sorted by score, highest first,
    of length min(N, len(sim_dict)).
    """
    # dict.items() already yields (doc_id, score) pairs -- no need to
    # rebuild them with a comprehension before sorting.
    fetch_score = lambda item: item[1]
    return sorted(sim_dict.items(), key=fetch_score, reverse=True)[:N]


class BM_25_from_index:
    """
    Okapi BM25 ("Best Match 25") ranking over a prebuilt inverted index.

    Parameters
    ----------
    index : inverted-index object
        Must expose `df` (term -> document frequency, dict-like) and
        `doclen` (doc_id -> document length, dict-like).
    avg_doclen : float
        Average document length in the corpus (AVGDL in the BM25 formula).
    k1 : float, default 1.2
        Term-frequency saturation parameter.
    b : float, default 0.75
        Document-length normalization parameter (0 = none, 1 = full).
    """

    def __init__(self, index, avg_doclen, k1=1.2, b=0.75):
        self.b = b
        self.k1 = k1
        self.index = index
        self.N = len(index.doclen)  # corpus size (number of documents)
        self.idf = None             # per-query idf cache, filled by search()/_score()
        self.AVGDL = avg_doclen

    def calc_idf(self, list_of_tokens):
        """
        Compute the BM25 idf value for each query term present in the index.

        Parameters
        ----------
        list_of_tokens : list of str
            Tokenized query, e.g. ['look', 'blue', 'sky'].

        Returns
        -------
        dict mapping term -> BM25 idf score. Terms absent from the index
        are simply omitted (they cannot contribute to any document score).
        """
        idf = {}
        for term in list_of_tokens:
            if term in self.index.df:
                n_ti = self.index.df[term]
                # Notice: log2 instead of natural log -- consistent scaling only.
                idf[term] = math.log2(1 + (self.N - n_ti + 0.5) / (n_ti + 0.5))
        return idf

    def get_candidate_documents_list(self, query, words, pls):
        """
        Collect the pool of candidate documents for a query: every document
        that contains at least one query term.

        Parameters
        ----------
        query : list of str
            Preprocessed query tokens (lowercased, stopwords removed, etc.).
        words : sequence of str
            Terms, positionally aligned with `pls`.
        pls : sequence
            Posting lists; pls[i] is a list of (doc_id, tf) pairs for words[i].

        Returns
        -------
        numpy array of unique candidate doc_ids.
        """
        # Map each term to its posting-list position once, instead of an
        # O(len(words)) list.index() scan per query term. setdefault keeps
        # the FIRST occurrence, matching list.index semantics.
        pos_by_word = {}
        for i, w in enumerate(words):
            pos_by_word.setdefault(w, i)
        candidates_list = []
        for term in np.unique(query):
            pos = pos_by_word.get(term)
            if pos is not None:
                candidates_list.extend(doc_id for doc_id, _ in pls[pos])
        return np.unique(candidates_list)

    def search(self, query, N=40, query_words=None, query_pls=None):
        """
        Rank candidate documents for a query by BM25 score.

        Only documents containing at least one query term are scored.

        Parameters
        ----------
        query : list of str
            Tokenized query, e.g. ['look', 'blue', 'sky'].
        N : int, default 40
            Number of top documents to return.
        query_words, query_pls : aligned sequences
            Terms and their posting lists (see get_candidate_documents_list).

        Returns
        -------
        Ranked list of up to N (doc_id, score) pairs, best first.
        """
        candidate_docs_list = self.get_candidate_documents_list(query, query_words, query_pls)
        self.idf = self.calc_idf(query)  # cache idf for _score / repeated use
        scores_dict = self._score(query, candidate_docs_list, query_words, query_pls)
        return get_top_n(scores_dict, N)

    def calc_bm25_formula(self, doc_id, term_frequencies, idf):
        """
        Evaluate the BM25 contribution of one term to one document.

        Parameters
        ----------
        doc_id : hashable
            Document id being scored.
        term_frequencies : dict
            doc_id -> term frequency for the current term.
        idf : float
            idf value of the current term.

        Returns
        -------
        float : this term's BM25 contribution for doc_id (0 when the term
        does not occur in the document, since the numerator vanishes).
        """
        doc_len = self.index.doclen.get(doc_id, 0)
        doc_freq = term_frequencies.get(doc_id, 0)
        bm25_numerator = idf * doc_freq * (self.k1 + 1)
        bm25_denominator = doc_freq + self.k1 * (1 - self.b + self.b * doc_len / self.AVGDL)
        return bm25_numerator / bm25_denominator

    def _score(self, query, candidate_docid_list, query_words=None, query_pls=None):
        """
        Compute the BM25 score of every candidate document for the query.

        Parameters
        ----------
        query : list of str
            Tokenized query; duplicate tokens contribute once per occurrence.
        candidate_docid_list : iterable of doc_ids to score.
        query_words, query_pls : aligned sequences of terms / posting lists.

        Returns
        -------
        dict mapping doc_id -> accumulated BM25 score (0.0 possible when a
        candidate matches no scored term).
        """
        # Plain-dict accumulation: same result as the previous pandas
        # DataFrame implementation, without per-query DataFrame overhead.
        scores = {doc_id: 0 for doc_id in candidate_docid_list}
        if query_words is None or query_pls is None:
            return scores
        if self.idf is None:
            # Tolerate direct calls that skipped search(): compute idf here
            # instead of silently swallowing an AttributeError (the old bare
            # `except: pass` hid real bugs and zeroed all scores).
            self.idf = self.calc_idf(query)
        for term in query:
            if term not in query_words:
                # No posting list for this term -> it cannot affect any score.
                continue
            query_term_freq = dict(query_pls[query_words.index(term)])
            term_idf = self.idf.get(term, 0)
            for doc_id in scores:
                scores[doc_id] += self.calc_bm25_formula(doc_id, query_term_freq, term_idf)
        return scores
62 changes: 62 additions & 0 deletions run_frontend_in_gcp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Runbook: provision a GCP VM and expose the search frontend on port 8080.
# NOTE(review): this is a step-by-step walkthrough, not a one-shot script --
# step 1 prints an IP that must be pasted into INSTANCE_IP before step 3.
INSTANCE_NAME="instance-2"
REGION=us-central1
ZONE=us-central1-a
PROJECT_NAME="OmriShaharIRProject"
IP_NAME="$PROJECT_NAME-ip"
GOOGLE_ACCOUNT_NAME="omrisgan" # without the @post.bgu.ac.il or @gmail.com part

# 0. Install Cloud SDK on your local machine or use Cloud Shell.
# check that you have a proper active account listed
gcloud auth list
# check that the right project and zone are active
gcloud config list
# if not, set them:
# gcloud config set project $PROJECT_NAME
# gcloud config set compute/zone $ZONE

# 1. Reserve a static public IP address.
gcloud compute addresses create $IP_NAME --project=$PROJECT_NAME --region=$REGION
gcloud compute addresses list
# note the IP address printed above; that's your external IP address.
# Enter it here:
INSTANCE_IP="35.226.91.7"

# 2. Create a firewall rule to allow traffic to port 8080 on the instance.
gcloud compute firewall-rules create default-allow-http-8080 \
  --allow tcp:8080 \
  --source-ranges 0.0.0.0/0 \
  --target-tags http-server
# (0.0.0.0/0 opens the port to the whole internet -- intended for this demo.)

# 3. Create the instance. Change to a larger instance (larger than e2-micro) as needed.
# The startup script (fetched from the course bucket) installs dependencies.
gcloud compute instances create $INSTANCE_NAME \
  --zone=$ZONE \
  --machine-type=e2-standard-8 \
  --network-interface=address=$INSTANCE_IP,network-tier=PREMIUM,subnet=default \
  --metadata startup-script-url=https://storage.cloud.google.com/206201667_316399773_bucket/startup_script_gcp.sh \
  --scopes=https://www.googleapis.com/auth/cloud-platform \
  --tags=http-server
# monitor instance creation log using this command. When done (4-5 minutes) terminate using Ctrl+C
gcloud compute instances tail-serial-port-output $INSTANCE_NAME --zone $ZONE

# 4. Secure-copy the app to the VM (replace LOCAL_PATH_TO with your path).
gcloud compute scp LOCAL_PATH_TO/search_frontend.py $GOOGLE_ACCOUNT_NAME@$INSTANCE_NAME:/home/$GOOGLE_ACCOUNT_NAME

# 5. SSH to the VM and start the app (run python3 INSIDE the SSH session).
gcloud compute ssh $GOOGLE_ACCOUNT_NAME@$INSTANCE_NAME
python3 search_frontend.py

################################################################################
# Clean-up commands to undo the above setup and avoid unnecessary charges.
gcloud compute instances delete -q $INSTANCE_NAME
# make sure there are no lingering instances
gcloud compute instances list
# delete firewall rule
gcloud compute firewall-rules delete -q default-allow-http-8080
# delete external addresses (reserved static IPs are billed while unattached)
gcloud compute addresses delete -q $IP_NAME --region $REGION

## Optional: copy index artifacts from the bucket onto the VM.
# gsutil cp -r gs://206201667_316399773_bucket/postings_gcp .
# gsutil cp -r gs://206201667_316399773_bucket/page_rank_normalized.pkl .
# gsutil cp -r gs://206201667_316399773_bucket/page_views_normalized.pkl .
# CHANGE DISK TO 100GB
Loading

0 comments on commit fe8cccf

Please sign in to comment.