-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upload Cultural Concepts Adaptation Dataset and Code
- Loading branch information
Showing
8 changed files
with
411 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,280 @@ | ||
import argparse
import os
import re

import networkx as nx
import openai
import pandas as pd
import requests
from openai import OpenAI

from utils import read_source_concepts_from_excel,save_results_to_excel,get_language_full_name,requests_retry_session
|
||
# Placeholder so the script can start without extra setup. setdefault() keeps a
# real key that is already exported in the environment instead of overwriting
# it with the placeholder.
os.environ.setdefault('OPENAI_API_KEY', 'your-api-key-here')
|
||
def read_source_concepts_from_excel(file_path):
    """Load the source concepts listed in the ``Concept`` column of an Excel file.

    Args:
        file_path: Path of the workbook, passed straight to ``pandas.read_excel``.

    Returns:
        A list of lower-cased concept strings in sheet order. Empty cells and
        cells that contain only whitespace are skipped, and surrounding
        whitespace is removed so it does not leak into later lookups.

    Raises:
        KeyError: If the sheet has no ``Concept`` column.
    """
    df = pd.read_excel(file_path)
    if 'Concept' not in df.columns:
        raise KeyError(f"Column 'Concept' not found in {file_path!r}")

    # dropna() removes empty cells before str() could turn them into "nan".
    cleaned = (str(value).strip().lower() for value in df['Concept'].dropna())
    return [concept for concept in cleaned if concept]
|
||
def save_results_to_excel(results, output_file):
    """Write result rows to an Excel workbook.

    Args:
        results: Rows with three values each: source concept, target concept
            and distance.
        output_file: Destination path handed to ``DataFrame.to_excel``.
    """
    column_names = ['Source Concept', 'Target Concept', 'Distance']
    table = pd.DataFrame(results, columns=column_names)
    # index=False keeps the pandas row index out of the sheet.
    table.to_excel(output_file, index=False)
|
||
|
||
def get_language_full_name(language_abbr):
    """Return the English name of a language given its two-letter code.

    String codes are matched case-insensitively and with surrounding
    whitespace ignored, so ``'EN'`` and ``' en '`` both resolve to
    ``'English'``.

    Args:
        language_abbr: Language code such as ``'en'`` or ``'zh'``.

    Returns:
        The full language name, or ``"Unknown Language"`` when the code is not
        in the mapping.
    """
    # Mapping of language abbreviations to full names
    language_map = {
        'en': 'English',
        'zh': 'Chinese',
        'ta': 'Tamil',
        'tr': 'Turkish',
        'sw': 'Swahili',
        'id': 'Indonesian',
    }

    if isinstance(language_abbr, str):
        language_abbr = language_abbr.strip().lower()

    return language_map.get(language_abbr, "Unknown Language")
|
||
|
||
|
||
class CulturalAdaptationGraph:
    """Directed graph linking one source concept to target-language concepts.

    The graph is filled from ConceptNet queries (synonyms, hypernyms and
    hyponyms) and, optionally, from concepts suggested by an OpenAI chat
    model. Every node created from ConceptNet data is keyed by its
    underscore form (see ``format_concept``) so that a node can always be
    found again under the same key it was stored with.
    """

    # Seconds to wait for a single ConceptNet response before giving up.
    REQUEST_TIMEOUT = 30

    # List markers such as "1.", "2)", "-", "*" or a bullet at the start of a line.
    _LIST_MARKER = re.compile(r'^(?:\d+[.)]|[-*\u2022])(?:\s+|$)')

    def __init__(self):
        self.graph = nx.DiGraph()

    def format_concept(self, concept):
        """Return *concept* with spaces replaced by underscores."""
        return concept.replace(" ", "_")

    def format_language(self, concept):
        """Return *concept* with underscores replaced by spaces."""
        return concept.replace("_", " ")

    @staticmethod
    def _split_concept_uri(uri):
        """Split a ``/c/<language>/<term>[/...]`` URI into ``(language, term)``.

        Anything after the term (for example a part-of-speech tag) is ignored.
        Returns ``(None, None)`` when *uri* does not have that shape.
        """
        parts = uri.split('/')
        if len(parts) < 4 or parts[0] != '' or parts[1] != 'c' or not parts[3]:
            return (None, None)
        return (parts[2], parts[3])

    @classmethod
    def _clean_generated_concept(cls, line):
        """Strip whitespace and one leading list marker from a generated line."""
        return cls._LIST_MARKER.sub('', line.strip(), count=1).strip()

    def _fetch_edges(self, url, what):
        """Return the ``edges`` list of the JSON document at *url*.

        Failures are reported on stdout and yield an empty list so that one
        failed lookup does not abort graph construction. *what* names the kind
        of data being fetched and only appears in the error message.
        """
        try:
            response = requests_retry_session().get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.json().get('edges', [])
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f"Error occurred while fetching {what}: {e}")
            return []

    def _successors_of_type(self, node, node_type):
        """Yield successors of *node* whose ``type`` attribute is *node_type*.

        The successor list is copied before iteration because callers add
        nodes and edges while looping. The type is checked lazily, at the
        moment each successor is reached.
        """
        for successor in list(self.graph.successors(node)):
            if self.graph.nodes[successor].get('type') == node_type:
                yield successor

    def concept_exists_in_conceptnet(self, concept, language):
        """Return True when ConceptNet lists at least one edge for the concept.

        Non-200 responses, request failures and undecodable bodies are
        reported on stdout and treated as "not found".
        """
        formatted_concept = self.format_concept(concept)
        url = f'http://api.conceptnet.io/c/{language}/{formatted_concept}'

        try:
            response = requests_retry_session().get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f"Error occurred: {response.status_code}")
                return False
            return bool(response.json().get('edges', []))
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Connection error: {e}")
            return False

    def add_hypernyms(self, concept, language):
        """Add the hypernyms (superordinate concepts) of *concept* to the graph.

        Queries ConceptNet for ``IsA`` edges starting at the concept and adds
        an edge ``concept -> hypernym`` for each edge whose start label equals
        the concept (compared case-insensitively, spaces and underscores
        treated alike).
        """
        concept = self.format_concept(concept)
        url = f'http://api.conceptnet.io/query?start=/c/{language}/{concept}&rel=/r/IsA'
        wanted = concept.lower()
        for edge in self._fetch_edges(url, 'hypernyms'):
            start_label = edge.get('start', {}).get('label') or ''
            # Labels use spaces while node keys use underscores; normalise
            # before comparing so multi-word concepts can match.
            if self.format_concept(start_label).lower() != wanted:
                continue
            hypernym = self.format_concept(edge.get('end', {}).get('label') or '')
            if not hypernym:
                continue
            self.graph.add_node(hypernym, language=language, type='hypernym')
            self.graph.add_edge(concept, hypernym, relation='hypernym')

    def add_hyponyms(self, concept, target_language):
        """Add the hyponyms (subordinate concepts) of *concept* to the graph.

        Fetches up to 1000 edges touching the concept and keeps the ``IsA``
        edges that end at it. The start term of each such edge becomes a
        hyponym node with an edge ``concept -> hyponym``.
        """
        concept = self.format_concept(concept)
        concept_id = f'/c/{target_language}/{concept}'
        url = f'http://api.conceptnet.io/query?node={concept_id}&limit=1000'
        wanted = self._split_concept_uri(concept_id.lower())
        for edge in self._fetch_edges(url, 'hyponyms'):
            if edge.get('rel', {}).get('label') != 'IsA':
                continue
            end_id = edge.get('end', {}).get('@id') or ''
            # Compare language and term only, so an id with a trailing sense
            # tag (e.g. ".../n") still counts as this concept.
            if self._split_concept_uri(end_id.lower()) != wanted:
                continue
            _, hyponym = self._split_concept_uri(edge.get('start', {}).get('@id') or '')
            # Skip unparsable ids and edges that would point back at the concept.
            if not hyponym or hyponym.lower() == concept.lower():
                continue
            self.graph.add_node(hyponym, language=target_language, type='hyponym')
            self.graph.add_edge(concept, hyponym, relation='hyponym')

    def add_translated_synonyms(self, concept, source_language, target_language):
        """Add target-language synonyms of a source-language concept.

        Queries ConceptNet for ``Synonym`` edges starting at the concept and
        adds an edge ``concept -> synonym`` for every edge whose end node is
        in *target_language*.
        """
        concept = self.format_concept(concept)
        url = f'http://api.conceptnet.io/query?start=/c/{source_language}/{concept}&rel=/r/Synonym'
        for edge in self._fetch_edges(url, 'translated synonyms'):
            end = edge.get('end', {})
            if end.get('language') != target_language:
                continue
            translated_synonym = self.format_concept(end.get('label') or '')
            if not translated_synonym:
                continue
            self.graph.add_node(translated_synonym, language=target_language, type='translated_synonym')
            self.graph.add_edge(concept, translated_synonym, relation='translated_synonym')

    def print_graph_info(self):
        """Print node and edge counts plus up to ten sample nodes with their edges."""
        print("Graph Information")
        print("=================")
        print(f"Number of nodes: {self.graph.number_of_nodes()}")
        print(f"Number of edges: {self.graph.number_of_edges()}")
        print("\nSample Nodes:")
        for node in list(self.graph.nodes())[:10]:
            print(f"Node: {node}, Edges: {list(self.graph.edges(node))[:10]}")

    def calculate_distances_to_source(self, source_concept, target_language):
        """Return target-language nodes reachable from the source, nearest first.

        Args:
            source_concept: The source concept; spaces and underscores are
                treated alike.
            target_language: Only nodes whose ``language`` attribute equals
                this value are reported.

        Returns:
            A list of ``(node, distance)`` tuples sorted by distance, where
            distance is the number of edges on the shortest directed path.
            Unreachable nodes are left out; the list is empty when the source
            is not in the graph.
        """
        source_key = self.format_concept(source_concept)
        if source_key not in self.graph:
            return []

        # One breadth-first search from the source covers every reachable node.
        reachable = nx.single_source_shortest_path_length(self.graph, source_key)
        distances = [
            (node, reachable[node])
            for node, attributes in self.graph.nodes(data=True)
            if attributes.get('language') == target_language and node in reachable
        ]
        # list.sort is stable, so equal distances keep node insertion order.
        distances.sort(key=lambda item: item[1])
        return distances

    def call_chatgpt_for_cultural_adaptation(self, concept, source_language, target_language, model="gpt-4", max_tokens=150):
        """Ask an OpenAI chat model for target-language analogues of *concept*.

        Returns:
            The message object of the first completion choice.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
            Exception: If the API call fails; the original error is chained.
        """
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is not set in environment variables")

        client = OpenAI(api_key=api_key)

        source_language_full = get_language_full_name(source_language)
        target_language_full = get_language_full_name(target_language)

        prompt = (
            f"List up to 10 common {target_language_full} concepts from Western culture "
            f"that can be analogously used to explain the {source_language_full} concept '{concept}'. "
            "Only list the concepts themselves, without explanations. "
            "Separate each concept with a newline."
        )

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=max_tokens,
            )
            return response.choices[0].message
        except Exception as e:
            raise Exception(f"Error in calling OpenAI API: {e}") from e

    def add_generated_concepts_to_graph(self, source_concept, response, target_language):
        """Add each non-empty line of ``response.content`` as a target-language node.

        A leading list marker ("1.", "2)", "-", ...) is removed from each
        line, and every resulting concept is linked from the source concept
        with a ``cultural_adaptation`` edge.
        """
        source_key = self.format_concept(source_concept)
        # content may be None; treat that as "no suggestions".
        for line in (response.content or '').splitlines():
            concept_clean = self._clean_generated_concept(line)
            if not concept_clean:
                continue
            self.graph.add_node(concept_clean, language=target_language)
            self.graph.add_edge(source_key, concept_clean, relation='cultural_adaptation')

    def _add_translated_hyponyms(self, concept, source_language, target_language, second_level=False):
        """Add translated synonyms of *concept* and the hyponyms of each synonym.

        With ``second_level=True`` the hyponyms of those hyponyms are added too.
        """
        self.add_translated_synonyms(concept, source_language, target_language)
        for translated_concept in self._successors_of_type(concept, 'translated_synonym'):
            self.add_hyponyms(translated_concept, target_language)
            if second_level:
                for hyponym in self._successors_of_type(translated_concept, 'hyponym'):
                    self.add_hyponyms(hyponym, target_language)

    def _expand_source_hypernyms(self, source_key, source_language, target_language):
        """Expand first- and second-order source-language hypernyms of the source."""
        for hypernym in self._successors_of_type(source_key, 'hypernym'):
            self._add_translated_hyponyms(hypernym, source_language, target_language)

            # Second-order hypernyms: hypernyms of this hypernym.
            self.add_hypernyms(hypernym, source_language)
            for second_order_hypernym in self._successors_of_type(hypernym, 'hypernym'):
                self._add_translated_hyponyms(
                    second_order_hypernym, source_language, target_language, second_level=True
                )

    def _expand_target_synonyms(self, source_key, target_language):
        """Expand the source's translated synonyms via target-language hypernyms."""
        for translated_synonym in self._successors_of_type(source_key, 'translated_synonym'):
            self.add_hypernyms(translated_synonym, target_language)
            for hypernym in self._successors_of_type(translated_synonym, 'hypernym'):
                self.add_hyponyms(hypernym, target_language)

    def run(self, source_concept, source_language, target_language, use_chatgpt):
        """Build the graph for one source concept and print the resulting distances.

        When the concept exists in ConceptNet the graph is grown from its
        translated synonyms and hypernyms. Otherwise, if *use_chatgpt* is
        true, concepts suggested by the chat model are added instead.

        Returns:
            The sorted ``(concept, distance)`` list that was printed.
        """
        source_key = self.format_concept(source_concept)

        # Check if the source concept exists in ConceptNet
        concept_found = self.concept_exists_in_conceptnet(source_concept, source_language)

        if concept_found:
            self.add_translated_synonyms(source_key, source_language, target_language)
            self.add_hypernyms(source_key, source_language)

            # The source only becomes a node once one of the calls above added an edge.
            if source_key in self.graph:
                self._expand_source_hypernyms(source_key, source_language, target_language)
                # For those without hypernyms in the source language, use
                # synonyms in the target language to construct the graph.
                self._expand_target_synonyms(source_key, target_language)

            print("source_concept:",source_concept)
            print("Exist")
        else:
            print("source_concept:",source_concept)
            print("No exist")

        if not concept_found and use_chatgpt:
            response = self.call_chatgpt_for_cultural_adaptation(source_concept, source_language, target_language)
            self.add_generated_concepts_to_graph(source_concept, response, target_language)

        self.print_graph_info()

        # Calculate and sort distances
        sorted_distances = self.calculate_distances_to_source(source_concept, target_language)
        for concept, distance in sorted_distances:
            print(f"Distance from '{source_concept}' to '{concept}': {distance}")
        return sorted_distances
|
||
def main():
    """Build a cultural adaptation graph per source concept and save all distances.

    Reads the source concepts from the input workbook, runs
    ``CulturalAdaptationGraph`` once per concept and writes every
    ``(source concept, target concept, distance)`` row to the output workbook.
    """
    parser = argparse.ArgumentParser(description="Cultural Adaptation Graph Builder")
    # The three options below are required: without them the script would only
    # fail later, inside the Excel reader or the graph code.
    parser.add_argument("--source_language", type=str, required=True, help="The source language")
    parser.add_argument("--target_language", type=str, required=True, help="The target language")
    parser.add_argument("--use_chatgpt", action='store_true', help="Use ChatGPT for fuzzy matching when a concept is not found in ConceptNet")
    parser.add_argument("--input_file", type=str, required=True, help="Path to the input Excel file with source concepts")
    parser.add_argument("--output_file", type=str, default="output.xlsx", help="Path to the output Excel file for results")

    args = parser.parse_args()

    source_concepts = read_source_concepts_from_excel(args.input_file)

    all_results = []

    for source_concept in source_concepts:
        # A fresh graph per concept keeps distances from mixing between concepts.
        cultural_graph = CulturalAdaptationGraph()
        try:
            cultural_graph.run(source_concept, args.source_language, args.target_language, args.use_chatgpt)
            distances = cultural_graph.calculate_distances_to_source(source_concept, args.target_language)
        except Exception as e:
            # Top-level boundary: report the failure and keep the results
            # already collected instead of losing the whole batch.
            print(f"Skipping '{source_concept}' after error: {e}")
            continue

        all_results.extend(
            (source_concept, target_concept, distance)
            for target_concept, distance in distances
        )

    save_results_to_excel(all_results, args.output_file)


if __name__ == "__main__":
    main()
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# This file may be used to create an environment using: | ||
# $ conda create --name <env> --file <this file> | ||
# platform: osx-64 | ||
annotated-types=0.6.0=pypi_0 | ||
anyio=4.2.0=pypi_0 | ||
appnope=0.1.3=pyhd8ed1ab_0 | ||
asttokens=2.4.1=pyhd8ed1ab_0 | ||
bzip2=1.0.8=h1de35cc_0 | ||
ca-certificates=2023.11.17=h8857fd0_0 | ||
certifi=2023.11.17=pypi_0 | ||
charset-normalizer=3.3.2=pypi_0 | ||
comm=0.2.1=pyhd8ed1ab_0 | ||
debugpy=1.8.0=py310h9e9d8ca_1 | ||
decorator=5.1.1=pyhd8ed1ab_0 | ||
distro=1.9.0=pypi_0 | ||
et-xmlfile=1.1.0=pypi_0 | ||
exceptiongroup=1.2.0=pyhd8ed1ab_0 | ||
executing=2.0.1=pyhd8ed1ab_0 | ||
h11=0.14.0=pypi_0 | ||
httpcore=1.0.2=pypi_0 | ||
httpx=0.26.0=pypi_0 | ||
idna=3.6=pypi_0 | ||
importlib-metadata=7.0.1=pyha770c72_0 | ||
importlib_metadata=7.0.1=hd8ed1ab_0 | ||
ipykernel=6.28.0=pyh3cd1d5f_0 | ||
ipython=8.19.0=pyh707e725_0 | ||
jedi=0.19.1=pyhd8ed1ab_0 | ||
jupyter_client=8.6.0=pyhd8ed1ab_0 | ||
jupyter_core=5.7.0=py310h2ec42d9_0 | ||
libcxx=16.0.6=hd57cbcb_0 | ||
libffi=3.4.4=hecd8cb5_0 | ||
libsodium=1.0.18=hbcb3906_1 | ||
matplotlib-inline=0.1.6=pyhd8ed1ab_0 | ||
ncurses=6.4=hcec6c5f_0 | ||
nest-asyncio=1.5.8=pyhd8ed1ab_0 | ||
networkx=3.2.1=pypi_0 | ||
numpy=1.26.3=pypi_0 | ||
openai=1.6.1=pypi_0 | ||
openpyxl=3.1.2=pypi_0 | ||
openssl=3.2.0=hd75f5a5_1 | ||
packaging=23.2=pyhd8ed1ab_0 | ||
pandas=2.1.4=pypi_0 | ||
parso=0.8.3=pyhd8ed1ab_0 | ||
pexpect=4.8.0=pyh1a96a4e_2 | ||
pickleshare=0.7.5=py_1003 | ||
pip=23.3.1=py310hecd8cb5_0 | ||
platformdirs=4.1.0=pyhd8ed1ab_0 | ||
prompt-toolkit=3.0.42=pyha770c72_0 | ||
psutil=5.9.7=py310hb372a2b_0 | ||
ptyprocess=0.7.0=pyhd3deb0d_0 | ||
pure_eval=0.2.2=pyhd8ed1ab_0 | ||
pydantic=2.5.3=pypi_0 | ||
pydantic-core=2.14.6=pypi_0 | ||
pygments=2.17.2=pyhd8ed1ab_0 | ||
python=3.10.13=h5ee71fb_0 | ||
python-dateutil=2.8.2=pyhd8ed1ab_0 | ||
python_abi=3.10=2_cp310 | ||
pytz=2023.3.post1=pypi_0 | ||
pyzmq=25.1.2=py310h6b67f7f_0 | ||
readline=8.2=hca72f7f_0 | ||
requests=2.31.0=pypi_0 | ||
setuptools=68.2.2=py310hecd8cb5_0 | ||
six=1.16.0=pyh6c4a22f_0 | ||
sniffio=1.3.0=pypi_0 | ||
sqlite=3.41.2=h6c40b1e_0 | ||
stack_data=0.6.2=pyhd8ed1ab_0 | ||
tk=8.6.12=h5d9f67b_0 | ||
tornado=6.3.3=py310h6729b98_1 | ||
tqdm=4.66.1=pypi_0 | ||
traitlets=5.14.1=pyhd8ed1ab_0 | ||
typing_extensions=4.9.0=pyha770c72_0 | ||
tzdata=2023.4=pypi_0 | ||
urllib3=2.1.0=pypi_0 | ||
wcwidth=0.2.12=pyhd8ed1ab_0 | ||
wheel=0.41.2=py310hecd8cb5_0 | ||
xz=5.4.5=h6c40b1e_0 | ||
zeromq=4.3.5=h93d8f39_0 | ||
zipp=3.17.0=pyhd8ed1ab_0 | ||
zlib=1.2.13=h4dc903c_0 |
Oops, something went wrong.