Commit f024f27a authored by Eugene Lu's avatar Eugene Lu
Browse files

data importing done

parent 34471517
# Created by https://www.gitignore.io/api/visualstudiocode
# Edit at https://www.gitignore.io/?templates=visualstudiocode
### VisualStudioCode ###
.vscode/* # Maybe .vscode/**/* instead - see comments
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
**/.history
# End of https://www.gitignore.io/api/visualstudiocode
\ No newline at end of file
# Created by https://www.gitignore.io/api/visualstudiocode
# Edit at https://www.gitignore.io/?templates=visualstudiocode
### VisualStudioCode ###
.vscode/* # Maybe .vscode/**/* instead - see comments
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
**/.history
# End of https://www.gitignore.io/api/visualstudiocode
.env
This diff is collapsed.
This diff is collapsed.
......@@ -32,9 +32,9 @@ create table state (
create table county (
id int,
state_id int,
name varchar(20),
lat decimal(8),
long decimal(8),
name varchar(50),
latitude decimal(8),
longitude decimal(8),
cases int,
deaths int,
population int,
......@@ -86,7 +86,7 @@ create table county_candidate_statistics (
foreign key (county_id) references county(id)
);
create table poll (
'''create table poll (
id int,
state_id int,
pollster varchar(50),
......@@ -111,4 +111,4 @@ create table poll (
pct decimal(4,2),
primary key (id),
foreign key (state_id) references state(id)
);
);'''
import csv
import os
import os
import dotenv
import mysql.connector as mysql
CSV_BASE_PATH = '../data'
......@@ -18,6 +21,8 @@ TRUMP_BIDEN_POLLS_PROCESSED_PATH= f'{CSV_BASE_PATH}/trump_biden_polls_processed.
TWEETS_PROCESSED_PATH = f'{CSV_BASE_PATH}/tweets_processed.csv'
USERS_PROCESSED_PATH = f'{CSV_BASE_PATH}/users_processed.csv'
BATCH_SIZE = 50
STATE_ABBREVIATIONS = {
'AL': 'ALABAMA',
'AK': 'ALASKA',
......@@ -95,13 +100,17 @@ def parse_tweet_csv(raw_path, hashtag, processed_tweets_path, processed_users_pa
tweet_writer = csv.writer(processed_tweets_file)
user_writer = csv.writer(processed_users_file)
idx = 0
user_idx = 0
errors = 0
for row in csv_reader:
if idx > 250000:
break
try:
idx += 1
user_id= row[6]
tweet_writer.writerow([idx, user_id, row[0], row[2], hashtag, row[3], row[4], row[5]])
if not user_id in users:
user_idx += 1
users[user_id] = True
county_id = None
# find user's county
......@@ -112,8 +121,8 @@ def parse_tweet_csv(raw_path, hashtag, processed_tweets_path, processed_users_pa
county_id = county.id
break
user_writer.writerow([row[6], county_id, row[7], row[8], row[9], row[10], row[11]])
except Exception as e:
user_writer.writerow([user_idx, county_id, row[7], row[8], row[9], row[10], row[11]])
except Exception:
# lots of corrupted data in csvs so fail silently
errors += 1
pass
......@@ -205,17 +214,123 @@ def parse_csvs():
parse_tweet_csv(HASHTAG_TRUMP_PATH, 'DonaldTrump', TWEETS_PROCESSED_PATH, USERS_PROCESSED_PATH)
if __name__ == "__main__":
parse_csvs()
def _bulk_insert(cursor, db, csv_path, query, table_name, row_limit=None, encoding=None):
    """Stream rows from ``csv_path`` into the database in batches.

    Each CSV row is converted so that empty string fields become SQL NULL
    (``None``).  Rows are accumulated and flushed with ``executemany`` every
    ``BATCH_SIZE`` rows, and once more on the final row of the file so a
    trailing partial batch is not lost.

    Args:
        cursor: open MySQL cursor used for the inserts.
        db: the connection owning ``cursor`` (needed for ``commit``).
        csv_path: path of the processed CSV file to load.
        query: parameterized ``INSERT IGNORE`` statement for the target table.
        table_name: table name, used only for the progress message.
        row_limit: if given, stop after reading this many rows.
        encoding: optional text encoding passed to ``open`` (the tweet/user
            CSVs require utf-8).

    Returns:
        The number of rows sent to the database.
    """
    with open(csv_path, encoding=encoding) as csvfile:
        csv_reader = csv.reader(csvfile)
        # Count the rows first so the last (possibly partial) batch can be
        # flushed when idx == file_len.
        file_len = sum(1 for _ in csv_reader)
        csvfile.seek(0)  # go back to top of file

        values = []
        rows = 0
        idx = 1
        for row in csv_reader:
            if row_limit is not None and idx > row_limit:
                break
            values.append(tuple(None if field == '' else field for field in row))
            # Flush on a full batch, or on the very last row of the file.
            if idx % BATCH_SIZE == 0 or idx == file_len:
                rows += len(values)
                cursor.executemany(query, values)
                db.commit()
                values = []
            idx += 1
    print(f'inserted {rows} rows into {table_name} table')
    return rows


def import_data():
    """Load the processed CSV files into the MySQL database.

    Credentials (DB_USER, DB_PASS, DB_NAME) are read from a .env file via
    python-dotenv.  All inserts use INSERT IGNORE, so re-running the import
    silently skips rows whose primary keys already exist.  The connection is
    always closed, even if an insert fails.
    """
    dotenv.load_dotenv(dotenv.find_dotenv())
    db = mysql.connect(
        host="marmoset04.shoshin.uwaterloo.ca",
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASS"),
        database=os.environ.get("DB_NAME"),
    )
    cursor = db.cursor()

    # The county table has 41 columns; build its placeholder list rather than
    # hand-writing 41 '%s' tokens.
    county_placeholders = ', '.join(['%s'] * 41)

    try:
        _bulk_insert(
            cursor, db, PRESIDENT_STATE_PROCESSED_PATH,
            'INSERT IGNORE INTO state VALUES(%s, %s, %s)',
            'state')
        # NOTE(review): the 1000-row limits below look like a debugging cap
        # carried over from development — confirm whether full loads are wanted.
        _bulk_insert(
            cursor, db, COUNTY_STATISTICS_PROCESSED_PATH,
            f'INSERT IGNORE INTO county VALUES({county_placeholders})',
            'county', row_limit=1000)
        _bulk_insert(
            cursor, db, PRESIDENT_COUNTY_CANDIDATE_PROCESSED_PATH,
            'INSERT IGNORE INTO county_candidate_statistics VALUES(%s, %s, %s, %s, %s, %s)',
            'county_candidate_statistics', row_limit=1000)
        _bulk_insert(
            cursor, db, USERS_PROCESSED_PATH,
            'INSERT IGNORE INTO user VALUES(%s, %s, %s, %s, %s, %s, %s)',
            'user', row_limit=1000, encoding='utf-8')
        _bulk_insert(
            cursor, db, TWEETS_PROCESSED_PATH,
            'INSERT IGNORE INTO tweet VALUES(%s, %s, %s, %s, %s, %s, %s, %s)',
            'tweet', row_limit=1000, encoding='utf-8')
    finally:
        # Bug fix: the original left the connection open if any insert raised.
        db.close()
if __name__ == "__main__":
    # CSV preprocessing is a one-time step; re-enable if the raw data changes.
    #parse_csvs()
    import_data()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment