forked from sinaptik-ai/pandas-ai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrom_google_sheets.py
125 lines (99 loc) · 3.63 KB
/
from_google_sheets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
import requests
import pandasai.pandas as pd
def get_google_sheet(src) -> list:
"""
Returns a 2D array of the contents of the Google Sheet at the given URL
Args:
src (str): The URL of the Google Sheet.
Returns:
list: A 2D array representing content of the Google Sheet.
"""
try:
from bs4 import BeautifulSoup
except ImportError as e:
raise ImportError(
"Could not import beautifulsoup4 python package. "
"Please install it with `pip install beautifulsoup4`."
) from e
# The size of the Google sheet that can be read is limited
raw_html = requests.get(src).text
soup = BeautifulSoup(raw_html, "html.parser")
table = soup.find("tbody")
rows = table.find_all("tr")
grid = []
for row in rows:
cols = row.find_all("td")
clean_row = [col.text for col in cols]
grid.append(clean_row)
return grid
def sheet_to_df(sheet) -> list:
"""
Returns a list of dataframes for each data table in a given spreadsheet
Args:
sheet (list): A 2D array of the contents of the Google Sheet
Returns:
list: A list of dataframes from the Google Sheet.
"""
# A dataframe starts when a header is found
# A header is the first instance of a set of contiguous alphanumeric columns
# A dataframe ends when a blank row is found or an empty column is found
num = 0 # The number of the dataframe
headers = [] # Each header is a tuple (num, row, col_start, col_end)
binding_headers = []
dfs = [] # Each df is a tuple (num, df)
# First pass: get all the headers
for row in range(len(sheet)):
# if every cell in the row is empty, skip row
if all(sheet[row][col].strip() == "" for col in range(len(sheet[row]))):
headers += binding_headers
binding_headers = []
continue
for col in range(len(sheet[row])):
# Check if the cell is bounded by a header
if any(col >= header[2] and col <= header[3] for header in binding_headers):
continue
# Check if the cell is commented out
if sheet[row][col].strip().startswith("//"):
continue
if re.search("[a-zA-Z]", sheet[row][col]):
head_start = col
head_end = col
while head_end < len(sheet[row]) and re.search(
"[a-zA-Z]", sheet[row][head_end]
):
head_end += 1
binding_headers.append([num, row, head_start, head_end])
num += 1
headers += binding_headers
# Second pass: get all the dataframes
for header in headers:
df = []
for row in range(header[1], len(sheet)):
if all(
sheet[row][col].strip() == "" for col in range(header[2], header[3])
):
break
df_row = [sheet[row][col] for col in range(header[2], header[3])]
df.append(df_row)
cols = df[0]
data = df[1:]
df = pd.DataFrame(data, columns=cols)
# Cast all the numeric columns to numeric types
for col in df.columns:
try:
df[col] = pd.to_numeric(df[col])
except ValueError:
pass
dfs.append(df)
return dfs
def from_google_sheets(url) -> list:
"""
Returns the dataframes that are in a Google sheet.
Args:
url (str): The URL of the Google Sheet
Returns:
list: A list of dataframes from the Google Sheet.
"""
sheet = get_google_sheet(url)
return sheet_to_df(sheet)