-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWordcount.py
28 lines (21 loc) · 994 Bytes
/
Wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pandas as pd
import re
from collections import Counter
# Load the input CSV into a DataFrame (the path is resolved relative to the
# current working directory).
df = pd.read_csv(filepath_or_buffer=r'Input.csv')
# Regular expression matching any whole word that contains "ology"
# (e.g. "Biology", "geology"). Note that \w also admits digits and "_".
pattern_ology = r'\b\w*ology\w*\b'

# Compiled once at import time so the per-title call does not have to pass
# the raw pattern and flags through the re module on every row.
_OLOGY_RE = re.compile(pattern_ology, re.IGNORECASE)


def extract_ology_words(title: str) -> list:
    """Return every word in *title* that contains "ology".

    Matching is case-insensitive. Each word is returned exactly as it is
    written in *title* (original casing preserved), in order of appearance;
    repeated words are returned once per occurrence.

    Args:
        title: The text to scan. Must be a ``str``.

    Returns:
        A list of the matching words; empty when nothing matches.

    Raises:
        TypeError: If *title* is not a string (raised by ``re``).
    """
    return _OLOGY_RE.findall(title)
# Apply the parsing function to the 'title' column. Each cell is passed
# through str() first so that non-string cells are scanned as text instead of
# raising a TypeError inside re.
df['ology'] = df['title'].apply(lambda x: extract_ology_words(str(x)))

# Flatten the per-title lists into one list of words. The words are matched
# case-insensitively, so fold them to lowercase before counting; otherwise
# "Biology" and "biology" would be tallied as two different words.
ology_words = [word.lower() for words in df['ology'] for word in words]
ology_word_counts = Counter(ology_words)

# Sort the word counts from most to least occurrences (most_common() with no
# argument returns every (word, count) pair, highest count first).
sorted_word_counts = ology_word_counts.most_common()

# Print the sorted word counts
print("Word counts in the 'ology' category (sorted from most to least occurrences):")
for word, count in sorted_word_counts:
    print(f"{word}: {count}")