Skip to content

Commit f1f8618

Browse files
committed
update upload to gsheets validations
1 parent 7d87659 commit f1f8618

File tree

3 files changed

+87
-24
lines changed

3 files changed

+87
-24
lines changed

.github/workflows/scrape.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
uses: actions/setup-python@v4
1919
with:
2020
python-version: "3.12"
21-
cache: "pip" # This caches pip dependencies
21+
cache: "pip"
2222

2323
- name: Install dependencies
2424
run: |
@@ -28,15 +28,19 @@ jobs:
2828
- name: Install Playwright browsers
2929
run: playwright install --with-deps chromium firefox webkit
3030

31-
- name: Run scraping and upload
31+
- name: Run scraping
3232
env:
33-
GCP_JSON: ${{ secrets.GCP_JSON }}
34-
GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
3533
PYTHONUNBUFFERED: 1
3634
run: |
3735
chmod +x ./scrape.sh
3836
./scrape.sh
39-
python upload_to_sheets.py
37+
38+
- name: Upload to Google Sheets
39+
env:
40+
GCP_JSON: ${{ secrets.GCP_JSON }}
41+
GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
42+
PYTHONUNBUFFERED: 1
43+
run: python upload_to_sheets.py
4044

4145
- name: Archive production artifacts
4246
uses: actions/upload-artifact@v3

public/.DS_Store

-6 KB
Binary file not shown.

upload_to_sheets.py

Lines changed: 78 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,24 @@
11
import os
2-
import csv
32
import json
43
import sys
4+
import time
5+
import random
6+
from dataclasses import dataclass
7+
from contextlib import contextmanager
8+
9+
import pandas as pd
510
from google.oauth2 import service_account
611
from googleapiclient.discovery import build
712
from googleapiclient.errors import HttpError
8-
import time
913

10-
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
11-
MAX_RETRIES = 3
12-
RETRY_DELAY = 5
14+
@dataclass
class Config:
    """Static settings for the Google Sheets upload script.

    Every field is a plain default; nothing in this class reads the
    environment. A single module-level instance, ``config``, is shared by
    the functions in this file.
    """

    # OAuth scopes requested when building the service-account credentials.
    scopes: tuple[str, ...] = ("https://www.googleapis.com/auth/spreadsheets",)
    # Total number of attempts upload_to_sheets makes before exiting.
    max_retries: int = 3
    # A1-notation range that is cleared and then overwritten with the CSV rows.
    sheet_range: str = 'Sheet1'
    # Numeric sheet (tab) ID targeted by the header formatting requests.
    sheet_id: int = 0  # Assumes first sheet in the spreadsheet


config = Config()
1322

1423
def get_env_var(var_name):
1524
value = os.environ.get(var_name)
@@ -23,15 +32,15 @@ def setup_credentials():
2332
try:
2433
creds_dict = json.loads(gcp_json)
2534
return service_account.Credentials.from_service_account_info(
26-
creds_dict, scopes=SCOPES)
35+
creds_dict, scopes=config.scopes)
2736
except json.JSONDecodeError:
2837
print("Error: Invalid JSON in GCP_JSON environment variable")
2938
sys.exit(1)
3039

3140
def read_csv(file_path):
3241
try:
33-
with open(file_path, 'r') as file:
34-
return list(csv.reader(file))
42+
df = pd.read_csv(file_path)
43+
return [df.columns.tolist()] + df.values.tolist()
3544
except FileNotFoundError:
3645
print(f"Error: CSV file not found at {file_path}")
3746
sys.exit(1)
@@ -43,52 +52,102 @@ def validate_data(data):
4352
# Add more validation as needed
4453
return True
4554

55+
@contextmanager
def get_sheets_service(creds):
    """Yield a Sheets v4 API client and close it when the ``with`` block ends.

    Args:
        creds: Credentials object handed straight to ``build``.

    Yields:
        The service object returned by ``build("sheets", "v4", ...)``.
    """
    sheets_client = build("sheets", "v4", credentials=creds)
    try:
        yield sheets_client
    finally:
        # Runs on normal exit and when the with-body raises.
        sheets_client.close()
62+
4663
def _blank_out_nan(rows):
    """Return a copy of *rows* with float NaN cells replaced by empty strings."""
    import math

    return [
        ['' if isinstance(cell, float) and math.isnan(cell) else cell
         for cell in row]
        for row in rows
    ]


def _resolve_sheet_id(spreadsheet, sheet_range, default_id):
    """Return the numeric ID of the tab named in *sheet_range*, else *default_id*."""
    title, sep, _ = sheet_range.rpartition('!')
    if not sep:
        # No "!" means the whole range string is the tab title (e.g. 'Sheet1').
        title = sheet_range
    if len(title) >= 2 and title[0] == title[-1] == "'":
        # A1 notation quotes titles with spaces and doubles embedded quotes.
        title = title[1:-1].replace("''", "'")

    for sheet in spreadsheet.get('sheets', []):
        props = sheet.get('properties', {})
        if props.get('title') == title and 'sheetId' in props:
            return props['sheetId']
    return default_id


def _header_format_requests(sheet_id):
    """Build the batchUpdate requests that bold and freeze row 1 of *sheet_id*."""
    return [
        {
            "repeatCell": {
                "range": {
                    "sheetId": sheet_id,
                    "startRowIndex": 0,
                    "endRowIndex": 1
                },
                "cell": {
                    "userEnteredFormat": {
                        "textFormat": {
                            "bold": True
                        }
                    }
                },
                "fields": "userEnteredFormat.textFormat.bold"
            }
        },
        {
            "updateSheetProperties": {
                "properties": {
                    "sheetId": sheet_id,
                    "gridProperties": {
                        "frozenRowCount": 1
                    }
                },
                "fields": "gridProperties.frozenRowCount"
            }
        }
    ]


def upload_to_sheets(service, spreadsheet_id, data):
    """Replace the contents of ``config.sheet_range`` with *data* and format the header.

    Each attempt fetches the spreadsheet metadata, clears the range, writes
    the rows with ``valueInputOption='RAW'``, then bolds and freezes the
    first row. The whole sequence is retried up to ``config.max_retries``
    times on ``HttpError``.

    Args:
        service: Sheets API service object (as yielded by get_sheets_service).
        spreadsheet_id: ID of the target spreadsheet.
        data: List of rows (lists of cell values); the first row is the header.

    Exits:
        Calls ``sys.exit(1)`` on a 403/404 response, or once all attempts
        have failed.
    """
    # NaN is emitted as the non-standard JSON token ``NaN``; blank those
    # cells so the request body stays valid JSON.
    body = {'values': _blank_out_nan(data)}

    for attempt in range(config.max_retries):
        try:
            spreadsheet = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
            print(f"Successfully accessed spreadsheet: {spreadsheet['properties']['title']}")

            # Clear the sheet
            service.spreadsheets().values().clear(
                spreadsheetId=spreadsheet_id,
                range=config.sheet_range
            ).execute()

            # Update values
            result = service.spreadsheets().values().update(
                spreadsheetId=spreadsheet_id,
                range=config.sheet_range,
                valueInputOption='RAW',
                body=body
            ).execute()
            print(f"{result.get('updatedCells')} cells updated.")

            # Format header row as bold and freeze it. Target the tab that was
            # just written rather than assuming its numeric ID is
            # config.sheet_id; fall back to that value if the title is not found.
            sheet_id = _resolve_sheet_id(
                spreadsheet, config.sheet_range, config.sheet_id)

            # Execute the formatting requests
            service.spreadsheets().batchUpdate(
                spreadsheetId=spreadsheet_id,
                body={"requests": _header_format_requests(sheet_id)}
            ).execute()

            print("Header row formatted as bold and frozen.")
            return
        except HttpError as err:
            if err.resp.status in [403, 404]:
                # Permission / not-found errors will not fix themselves on retry.
                print(f"Error {err.resp.status}: {err}")
                print("Check spreadsheet ID and service account permissions.")
                sys.exit(1)
            elif attempt < config.max_retries - 1:
                # Exponential backoff (1s, 2s, 4s, ...) plus up to 1s of jitter.
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Attempt {attempt + 1} failed. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {config.max_retries} attempts: {err}")
                sys.exit(1)
79138

80139
def main():
    """Script entry point: load credentials and CSV rows, then upload them.

    Reads the spreadsheet ID via ``get_env_var('GOOGLE_SHEETS_ID')`` and the
    rows from ``output/merged.csv``. Exits with status 1 when
    ``validate_data`` returns a falsy value.
    """
    creds = setup_credentials()
    spreadsheet_id = get_env_var('GOOGLE_SHEETS_ID')

    print(f"Attempting to access spreadsheet with ID: {spreadsheet_id}")

    rows = read_csv('output/merged.csv')
    if validate_data(rows):
        # The service is only built once the data has passed validation.
        with get_sheets_service(creds) as service:
            upload_to_sheets(service, spreadsheet_id, rows)
    else:
        sys.exit(1)
92151

93152
if __name__ == "__main__":
94153
main()

0 commit comments

Comments
 (0)