Add files via upload

Upload Cultural Concepts Adaptation Dataset and Code
zhilizju · Jan 9, 2024 · 25a2ac9 · 25a2ac9
1 parent 99aecfa
commit 25a2ac9
Show file tree

Hide file tree

Showing 8 changed files with 411 additions and 0 deletions.
diff --git a/Buildgraph.py b/Buildgraph.py
@@ -0,0 +1,280 @@
+import networkx as nx
+import requests
+import argparse
+import openai
+from openai import OpenAI
+import pandas as pd
+import os
+from utils import read_source_concepts_from_excel,save_results_to_excel,get_language_full_name,requests_retry_session
+
+os.environ['OPENAI_API_KEY'] = 'your-api-key-here'
+
+def read_source_concepts_from_excel(file_path):
+    df = pd.read_excel(file_path)
+    source_concepts = df['Concept'].tolist()
+    source_concepts = [str(concept).lower() for concept in source_concepts if not pd.isna(concept)]
+    return source_concepts
+
+def save_results_to_excel(results, output_file):
+    df = pd.DataFrame(results, columns=['Source Concept', 'Target Concept', 'Distance'])
+    df.to_excel(output_file, index=False)
+
+
+def get_language_full_name(language_abbr):
+    # Mapping of language abbreviations to full names
+    language_map = {
+        'en': 'English',
+        'zh': 'Chinese',
+        'ta': 'Tamil',
+        'tr': 'Turkish',
+        'sw': 'Swahili',
+        'id': 'Indonesian'
+    }
+
+    # Retrieve and return the full name of the language
+    return language_map.get(language_abbr, "Unknown Language")    
+
+
+
+class CulturalAdaptationGraph:
+    def __init__(self):
+        self.graph = nx.DiGraph()
+
+    def format_concept(self, concept):
+        # Replace spaces with underscores
+        return concept.replace(" ", "_")
+
+    def format_language(self, concept):
+        # Replace underscores with spaces
+        return concept.replace("_", " ")
+
+    def concept_exists_in_conceptnet(self, concept, language):
+        formatted_concept = concept.replace(" ", "_")
+        url = f'http://api.conceptnet.io/c/{language}/{formatted_concept}'
+
+        try:
+            response = requests_retry_session().get(url)
+            if response.status_code == 200:
+                data = response.json()
+                return len(data.get('edges', [])) > 0
+            else:
+                print(f"Error occurred: {response.status_code}")
+                return False
+        except requests.exceptions.ConnectionError as e:
+            print(f"Connection error: {e}")
+            return False   
+
+
+
+    def add_hypernyms(self, concept, language):
+        # Add hypernyms (superordinate concepts)
+        concept = self.format_concept(concept)
+        url = f'http://api.conceptnet.io/query?start=/c/{language}/{concept}&rel=/r/IsA'
+        try:
+            response = requests.get(url).json()
+            for edge in response['edges']:
+                if edge['start']['label'].lower() == concept.lower():
+                    hypernym = edge['end']['label']
+                    self.graph.add_node(hypernym, language=language, type='hypernym')
+                    self.graph.add_edge(concept, hypernym, relation='hypernym')
+        except Exception as e:
+            print(f"Error occurred while fetching hypernyms: {e}")
+
+    def add_hyponyms(self, concept, target_language):
+        # Fetch all relations of a concept, then filter for hyponyms (subordinate concepts)
+        concept = self.format_concept(concept)
+        concept_id = f'/c/{target_language}/{concept}'
+        url = f'http://api.conceptnet.io/query?node={concept_id}&limit=1000'
+        try:
+            response = requests.get(url).json()
+            for edge in response['edges']:
+                # Check for hyponym relation, ensuring it's for the correct concept
+                if edge['rel']['label'] == 'IsA' and edge['end']['@id'].lower() == concept_id:
+                    hyponym_full_id = edge['start']['@id']
+                    # Extract the actual concept part
+                    hyponym = hyponym_full_id.split('/')[-1]
+                    self.graph.add_node(hyponym, language=target_language, type='hyponym')
+                    self.graph.add_edge(concept, hyponym, relation='hyponym')
+        except Exception as e:
+            print(f"Error occurred while fetching hyponyms: {e}")
+
+    def add_translated_synonyms(self, concept, source_language, target_language):
+        # Add synonyms in the target language
+        concept = self.format_concept(concept)
+        url = f'http://api.conceptnet.io/query?start=/c/{source_language}/{concept}&rel=/r/Synonym'
+        try:
+            response = requests.get(url).json()
+            for edge in response['edges']:
+                if edge['end']['language'] == target_language:
+                    translated_synonym = edge['end']['label']
+                    translated_synonym = self.format_concept(translated_synonym)
+                    self.graph.add_node(translated_synonym, language=target_language, type='translated_synonym')
+                    self.graph.add_edge(concept, translated_synonym, relation='translated_synonym')
+        except Exception as e:
+            print(f"Error occurred while fetching translated synonyms: {e}")
+
+    def print_graph_info(self):
+        print("Graph Information")
+        print("=================")
+        print(f"Number of nodes: {self.graph.number_of_nodes()}")
+        print(f"Number of edges: {self.graph.number_of_edges()}")
+        print("\nSample Nodes:")
+        for node in list(self.graph.nodes())[:10]:
+            print(f"Node: {node}, Edges: {list(self.graph.edges(node))[:10]}")     
+
+    def calculate_distances_to_source(self, source_concept, target_language):
+        distances = {}
+        for node in self.graph.nodes:
+            if self.graph.nodes[node].get('language') == target_language:
+                try:
+                    if source_concept in self.graph and node in self.graph:
+                        # Calculate the distance from the source concept to each concept in the target language
+                        distance = nx.shortest_path_length(self.graph, source=source_concept, target=node)
+                        distances[node] = distance
+                except nx.NetworkXNoPath:
+                    # Ignore the node if no path exists
+                    continue
+
+        # Sort the distances
+        sorted_distances = sorted(distances.items(), key=lambda x: x[1])
+        return sorted_distances
+
+
+
+    def call_chatgpt_for_cultural_adaptation(self, concept, source_language, target_language, model="gpt-4", max_tokens=150):
+        api_key = os.getenv('OPENAI_API_KEY')
+        if not api_key:
+            raise ValueError("OpenAI API key is not set in environment variables")
+
+        client = OpenAI(api_key=api_key)
+
+        source_language_full=get_language_full_name(source_language)
+        target_language_full=get_language_full_name(target_language)
+
+        prompt=f"List up to 10 common {target_language_full} concepts from Western culture that can be analogously used to explain the {source_language_full} concept '{concept}'. Only list the concepts themselves, without explanations. Separate each concept with a newline."
+
+        try:
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=max_tokens
+            )
+            return response.choices[0].message
+        except Exception as e:
+            raise Exception(f"Error in calling OpenAI API: {e}")
+
+
+    def add_generated_concepts_to_graph(self, source_concept, response, target_language):
+        concepts = response.content.split('\n')
+        for concept in concepts:
+            if concept.strip():
+
+                concept_clean = concept.split('. ', 1)[-1].strip()
+                self.graph.add_node(concept_clean, language=target_language)
+                self.graph.add_edge(source_concept, concept_clean, relation='cultural_adaptation')
+
+
+
+
+    def run(self, source_concept, source_language, target_language, use_chatgpt):
+        # Check if the source concept exists in ConceptNet
+        concept_found = self.concept_exists_in_conceptnet(source_concept, source_language)
+
+        if concept_found:
+            # Add translated synonyms for the source concept
+            self.add_translated_synonyms(source_concept, source_language, target_language)
+
+            # Add hypernyms for the source concept (first-order)
+            self.add_hypernyms(source_concept, source_language)
+
+            # Process each first-order hypernym
+            if source_concept in self.graph:
+                for hypernym in list(self.graph.successors(source_concept)):
+                    if self.graph.nodes[hypernym].get('type') == 'hypernym':
+                        self.add_translated_synonyms(hypernym, source_language, target_language)
+                        for translated_concept in list(self.graph.successors(hypernym)):
+                            if self.graph.nodes[translated_concept].get('type') == 'translated_synonym':
+                                self.add_hyponyms(translated_concept, target_language)
+
+
+
+                        # Add hypernyms for this hypernym （two-order）
+                        self.add_hypernyms(hypernym, source_language)
+
+
+                        # Process each second-order hypernym
+                        for second_order_hypernym in list(self.graph.successors(hypernym)):
+                            if self.graph.nodes[second_order_hypernym].get('type') == 'hypernym':
+                                self.add_translated_synonyms(second_order_hypernym, source_language, target_language)
+                                for translated_concept in list(self.graph.successors(second_order_hypernym)):
+                                    if self.graph.nodes[translated_concept].get('type') == 'translated_synonym':
+                                        self.add_hyponyms(translated_concept, target_language)
+
+                                    # Process each hyponym of the second-order hypernym
+                                    for one_order_hyponym in list(self.graph.successors(translated_concept)):
+                                        if self.graph.nodes[one_order_hyponym].get('type') == 'hyponym':
+                                            # Add hyponyms of the hyponym (third-order)
+                                            self.add_hyponyms(one_order_hyponym, target_language)
+
+                # For those without hypernyms in the source language, use synonyms in the target language to construct [the graph].         
+                for translated_synonym in list(self.graph.successors(source_concept)):
+                    if self.graph.nodes[translated_synonym].get('type') == 'translated_synonym':
+
+                        # Add hypernyms for this translated_synonym
+                        self.add_hypernyms(translated_synonym, target_language)
+                        for hypernym in list(self.graph.successors(translated_synonym)):
+                            if self.graph.nodes[hypernym].get('type') == 'hypernym':
+                                self.add_hyponyms(hypernym, target_language)
+
+
+                print("source_concept:",source_concept)
+                print("Exist")
+
+        else:
+            print("source_concept:",source_concept)
+            print("No exist")
+
+
+        if not concept_found and use_chatgpt:
+            response = self.call_chatgpt_for_cultural_adaptation(source_concept, source_language, target_language)
+            self.add_generated_concepts_to_graph(source_concept, response, target_language)
+
+
+
+        self.print_graph_info()
+
+        # Calculate and sort distances
+        sorted_distances = self.calculate_distances_to_source(source_concept, target_language)
+        for concept, distance in sorted_distances:
+            print(f"Distance from '{source_concept}' to '{concept}': {distance}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Cultural Adaptation Graph Builder")
+    # parser.add_argument("--source_concept", type=str, help="The source concept")
+    parser.add_argument("--source_language", type=str, help="The source language")
+    parser.add_argument("--target_language", type=str, help="The target language")
+    parser.add_argument("--use_chatgpt", action='store_true', help="Use ChatGPT for fuzzy matching when a concept is not found in ConceptNet")
+    parser.add_argument("--input_file", type=str, help="Path to the input Excel file with source concepts")
+    parser.add_argument("--output_file", type=str, default="output.xlsx", help="Path to the output Excel file for results")
+
+    args = parser.parse_args()
+
+    source_concepts = read_source_concepts_from_excel(args.input_file)
+
+    all_results = []
+
+    for source_concept in source_concepts:
+        cultural_graph = CulturalAdaptationGraph()
+        cultural_graph.run(source_concept, args.source_language, args.target_language, args.use_chatgpt)
+        distances = cultural_graph.calculate_distances_to_source(source_concept, args.target_language)
+
+        for target_concept, distance in distances:
+            all_results.append((source_concept, target_concept, distance))
+
+
+
+    save_results_to_excel(all_results, args.output_file)
+
diff --git a/Chinese_English_adaptation_with_chatgpt.xlsx b/Chinese_English_adaptation_with_chatgpt.xlsx
diff --git a/Indonesia_English_adaptation_with_chatgpt.xlsx b/Indonesia_English_adaptation_with_chatgpt.xlsx
diff --git a/Swahili_English_adaptation_with_chatgpt.xlsx b/Swahili_English_adaptation_with_chatgpt.xlsx
diff --git a/Tamil_English_adaptation_with_chatgpt.xlsx b/Tamil_English_adaptation_with_chatgpt.xlsx
diff --git a/Turkish_English_adaptation_with_chatgpt.xlsx b/Turkish_English_adaptation_with_chatgpt.xlsx
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,79 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: osx-64
+annotated-types=0.6.0=pypi_0
+anyio=4.2.0=pypi_0
+appnope=0.1.3=pyhd8ed1ab_0
+asttokens=2.4.1=pyhd8ed1ab_0
+bzip2=1.0.8=h1de35cc_0
+ca-certificates=2023.11.17=h8857fd0_0
+certifi=2023.11.17=pypi_0
+charset-normalizer=3.3.2=pypi_0
+comm=0.2.1=pyhd8ed1ab_0
+debugpy=1.8.0=py310h9e9d8ca_1
+decorator=5.1.1=pyhd8ed1ab_0
+distro=1.9.0=pypi_0
+et-xmlfile=1.1.0=pypi_0
+exceptiongroup=1.2.0=pyhd8ed1ab_0
+executing=2.0.1=pyhd8ed1ab_0
+h11=0.14.0=pypi_0
+httpcore=1.0.2=pypi_0
+httpx=0.26.0=pypi_0
+idna=3.6=pypi_0
+importlib-metadata=7.0.1=pyha770c72_0
+importlib_metadata=7.0.1=hd8ed1ab_0
+ipykernel=6.28.0=pyh3cd1d5f_0
+ipython=8.19.0=pyh707e725_0
+jedi=0.19.1=pyhd8ed1ab_0
+jupyter_client=8.6.0=pyhd8ed1ab_0
+jupyter_core=5.7.0=py310h2ec42d9_0
+libcxx=16.0.6=hd57cbcb_0
+libffi=3.4.4=hecd8cb5_0
+libsodium=1.0.18=hbcb3906_1
+matplotlib-inline=0.1.6=pyhd8ed1ab_0
+ncurses=6.4=hcec6c5f_0
+nest-asyncio=1.5.8=pyhd8ed1ab_0
+networkx=3.2.1=pypi_0
+numpy=1.26.3=pypi_0
+openai=1.6.1=pypi_0
+openpyxl=3.1.2=pypi_0
+openssl=3.2.0=hd75f5a5_1
+packaging=23.2=pyhd8ed1ab_0
+pandas=2.1.4=pypi_0
+parso=0.8.3=pyhd8ed1ab_0
+pexpect=4.8.0=pyh1a96a4e_2
+pickleshare=0.7.5=py_1003
+pip=23.3.1=py310hecd8cb5_0
+platformdirs=4.1.0=pyhd8ed1ab_0
+prompt-toolkit=3.0.42=pyha770c72_0
+psutil=5.9.7=py310hb372a2b_0
+ptyprocess=0.7.0=pyhd3deb0d_0
+pure_eval=0.2.2=pyhd8ed1ab_0
+pydantic=2.5.3=pypi_0
+pydantic-core=2.14.6=pypi_0
+pygments=2.17.2=pyhd8ed1ab_0
+python=3.10.13=h5ee71fb_0
+python-dateutil=2.8.2=pyhd8ed1ab_0
+python_abi=3.10=2_cp310
+pytz=2023.3.post1=pypi_0
+pyzmq=25.1.2=py310h6b67f7f_0
+readline=8.2=hca72f7f_0
+requests=2.31.0=pypi_0
+setuptools=68.2.2=py310hecd8cb5_0
+six=1.16.0=pyh6c4a22f_0
+sniffio=1.3.0=pypi_0
+sqlite=3.41.2=h6c40b1e_0
+stack_data=0.6.2=pyhd8ed1ab_0
+tk=8.6.12=h5d9f67b_0
+tornado=6.3.3=py310h6729b98_1
+tqdm=4.66.1=pypi_0
+traitlets=5.14.1=pyhd8ed1ab_0
+typing_extensions=4.9.0=pyha770c72_0
+tzdata=2023.4=pypi_0
+urllib3=2.1.0=pypi_0
+wcwidth=0.2.12=pyhd8ed1ab_0
+wheel=0.41.2=py310hecd8cb5_0
+xz=5.4.5=h6c40b1e_0
+zeromq=4.3.5=h93d8f39_0
+zipp=3.17.0=pyhd8ed1ab_0
+zlib=1.2.13=h4dc903c_0