Commit e131fc06 authored by dj5vande's avatar dj5vande
Browse files
parents affa0dc0 523c6e8e
US Election Group 17
# US Election Group 17
Members : Eugene Lu and Daniel van den Hoven
Pre-Requisites for running it are
## Pre-Requisites
1. `pip install mysql-connector-python`
2. `pip install python-dotenv`
3. Update `.env` with your credentials, where `DB_USER` is your Marmoset4 username, `DB_PASS` is your Marmoset4 password, and `DB_NAME` is the name of the database you wish to connect to.
Under the db folder, run the file `create_tables.sql` to setup the relational model for the tables for our project.
Then run `python3 ./import_data.py` to insert the relevant data into the database.
## DB Tools
Make sure these commands are run from the repository root.
#### To create all the tables and import the data:
`python3 db/db_init.py`
To run the CLI, enter the CLI directory and run `./client.py` in the terminal to start querying.
#### To reset the database:
`python3 db/db_reset.py`
## Running the CLI
`python3 CLI/client.py`
The CLI is an infinite loop until the command `quit` is entered. A good place to start to understand
the CLI is to run `help` or to view our video.
## Testing
CLI Test Cases are shown under `CLI/TestCases` which represent the tests run to maintain integrity of our client.
Full details of the test plan can be viewed in the project documentation.
This diff is collapsed.
This diff is collapsed.
......@@ -89,30 +89,3 @@ create table county_candidate_statistics (
-- Secondary indexes on county_candidate_statistics to speed up lookups
-- filtered by party, candidate, or vote count.
alter table county_candidate_statistics add index party_idx (party);
alter table county_candidate_statistics add index candidate_idx (candidate);
alter table county_candidate_statistics add index votes_idx (votes);
-- NOTE(review): the ''' delimiters below are Python-style quoting, not valid
-- SQL comment syntax, so a SQL client executing this file may error on this
-- block. It looks like a disabled/dead definition of the poll table — confirm
-- it can be deleted, or wrap it in a standard /* ... */ SQL comment instead.
'''create table poll (
id int,
state_id int,
pollster varchar(50),
cycle int,
sponsors varchar(50),
display_name varchar(50),
fte_grade varchar(2),
sample_size int,
population varchar(2),
methodology varchar(50),
start_date timestamp,
end_date timestamp,
election_date timestamp,
sponsor_candidate varchar(50),
internal boolean,
partisan varchar(10),
tracking boolean,
created_at timestamp,
url varchar(200),
candidate_name varchar(50),
candidate_party varchar(10),
pct decimal(4,2),
primary key (id),
foreign key (state_id) references state(id)
);'''
from import_data import import_data, parse_csvs
import dotenv
import os
import mysql.connector as mysql
# create_tables creates all the tables and indexes in the schema
def create_tables(db):
    """Run every statement in db/create_tables.sql against *db*.

    *db* is an open mysql-connector connection; all statements are
    committed in one transaction once the script has been executed.
    """
    with open('db/create_tables.sql', 'r') as schema_file, db.cursor() as cursor:
        # With multi=True, execute() returns a generator yielding one result
        # per statement; consuming it is what actually runs each statement.
        statements = cursor.execute(schema_file.read(), multi=True)
        for result in statements:
            print("Running query: ", result)
    db.commit()
if __name__ == "__main__":
    # Pull Marmoset4 credentials from the nearest .env file, then connect.
    dotenv.load_dotenv(dotenv.find_dotenv())
    db = mysql.connect(
        host="marmoset04.shoshin.uwaterloo.ca",
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASS"),
        database=os.environ.get("DB_NAME"),
    )
    # Build the schema, then parse the raw CSVs and load them into the DB.
    create_tables(db)
    print('done creating tables')
    parse_csvs()
    print('done parsing csvs')
    import_data(db)
    print('db initialization complete')
\ No newline at end of file
import dotenv
import os
import mysql.connector as mysql
# drop_tables drops all the tables
def drop_tables(db):
    """Execute every statement in db/drop_tables.sql on *db* and commit."""
    with open('db/drop_tables.sql', 'r') as sql_file, db.cursor() as cursor:
        # multi=True makes execute() return a generator; iterating it is
        # what actually executes each DROP statement in turn.
        for result in cursor.execute(sql_file.read(), multi=True):
            print("Running query: ", result)
    db.commit()
if __name__ == "__main__":
    # Credentials come from the nearest .env file (DB_USER / DB_PASS / DB_NAME).
    dotenv.load_dotenv(dotenv.find_dotenv())
    db = mysql.connect(
        host="marmoset04.shoshin.uwaterloo.ca",
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASS"),
        database=os.environ.get("DB_NAME"),
    )
    # Tear the schema down so db_init can be run again from scratch.
    drop_tables(db)
    print('db reset complete')
\ No newline at end of file
-- Drop all project tables.
-- NOTE(review): the order appears chosen so that referencing tables are
-- dropped before the tables they reference (e.g. state is dropped last,
-- and create_tables.sql shows foreign keys into state) — confirm against
-- the full schema before reordering.
drop table if exists tweet;
drop table if exists user;
drop table if exists county_candidate_statistics;
drop table if exists county;
drop table if exists state;
\ No newline at end of file
......@@ -5,7 +5,7 @@ import dotenv
import mysql.connector as mysql
CSV_BASE_PATH = '../data'
CSV_BASE_PATH = 'data'
COUNTY_STATISTICS_PATH = f'{CSV_BASE_PATH}/county_statistics.csv'
PRESIDENT_COUNTY_PATH = f'{CSV_BASE_PATH}/president_county.csv'
PRESIDENT_COUNTY_CANDIDATE_PATH = f'{CSV_BASE_PATH}/president_county_candidate.csv'
......@@ -91,9 +91,8 @@ states = {} # states stores the mapping: state-name -> state_id
counties = {} # counties stores the mapping: {state-name}_{county-name} -> County(id, lat, long)
users = {} # users maps user_id -> bool
def parse_tweet_csv(raw_path, hashtag, processed_tweets_path, processed_users_path):
with open(raw_path, encoding='utf-8') as raw_file, open(processed_tweets_path, 'w', newline='', encoding='utf-8') as processed_tweets_file, open(processed_users_path, 'w', newline='', encoding='utf-8') as processed_users_file:
with open(raw_path, encoding='utf-8') as raw_file, open(processed_tweets_path, 'a', newline='', encoding='utf-8') as processed_tweets_file, open(processed_users_path, 'w', newline='', encoding='utf-8') as processed_users_file:
# skip the first row since it only includes column names
next(raw_file)
csv_reader = csv.reader(raw_file)
......@@ -103,11 +102,11 @@ def parse_tweet_csv(raw_path, hashtag, processed_tweets_path, processed_users_pa
user_idx = 0
errors = 0
for row in csv_reader:
if idx > 100000:
if idx > 50000: # limit the amount of tweets being imported as inserting into the database is very slow
break
try:
idx += 1
user_id= row[6]
user_id = row[6]
tweet_writer.writerow([idx, user_idx, row[0], row[2], hashtag, row[3], row[4], row[5]])
if not user_id in users:
user_idx += 1
......@@ -126,7 +125,7 @@ def parse_tweet_csv(raw_path, hashtag, processed_tweets_path, processed_users_pa
# lots of corrupted data in csvs so fail silently
errors += 1
pass
print(f'done parsing {idx} rows, {errors} errors.')
print(f'tweets: done parsing {idx} rows, {errors} errors.')
def parse_csvs():
......@@ -214,20 +213,7 @@ def parse_csvs():
parse_tweet_csv(HASHTAG_TRUMP_PATH, 'DonaldTrump', TWEETS_PROCESSED_PATH, USERS_PROCESSED_PATH)
def import_data():
dotenv.load_dotenv(dotenv.find_dotenv())
username = os.environ.get("DB_USER")
passphrase = os.environ.get("DB_PASS")
dbname = os.environ.get("DB_NAME")
db = mysql.connect(
host="marmoset04.shoshin.uwaterloo.ca",
user=username,
password=passphrase,
database=dbname
)
def import_data(db):
cursor = db.cursor()
with open(PRESIDENT_STATE_PROCESSED_PATH) as csvfile:
......@@ -321,8 +307,3 @@ def import_data():
idx += 1
print(f'inserted {rows} rows into tweet table')
db.close()
if __name__ == "__main__":
parse_csvs()
import_data()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment