Data-Preparation

2 minute read

Merging the Data and Storing in a Database/Visualizing Data

Now that you have cleaned and transformed your 3 datasets, you need to merge them together and load the data into a database. You can choose what kind of database (SQLLite or MySQL, Postgre SQL are all free options).

Once all the data is merged together in your database, create 5 visualizations that demonstrate the data you have cleansed. You should have at least 2 visualizations that have data from more than one source.

For the visualization portion of the project, you are welcome to use a python library like Matplotlib, Seaborn, or an R package ggPlot2, Plotly, or Tableau/PowerBI.

Get data from the web

countries = ["korea-south", "united-states", "india", "nicaragua"]
baseURLa = "https://api.covid19api.com/total/country/"
baseURLb = "/status/confirmed?from=2020-01-01T00:00:00Z&to=2020-05-20T00:00:00Z"

load libraries - use the connect function to connect to the database

import requests
import mysql.connector
connection = mysql.connector.connect(user='root', password='123456789', host='127.0.0.1', database="covid_data")
cursor = connection.cursor(buffered=True)
korea_request = requests.get(baseURLa + countries[0] + baseURLb)

korea_data = korea_request.json()

print(korea_data[0])
{'Country': 'Korea (South)', 'CountryCode': '', 'Province': '', 'City': '', 'CityCode': '', 'Lat': '0', 'Lon': '0', 'Cases': 1, 'Status': 'confirmed', 'Date': '2020-01-22T00:00:00Z'}
for day in korea_data:
    query = ("INSERT INTO covid_data.new_table (country, cases, province, date) VALUES (%s, %s, %s, %s)")

    day["Date"] = day["Date"].split("T")[0]
  
    cursor.execute(query, (day["Country"], day["Cases"], day["Province"], day["Date"]))
    connection.commit()
nicaragua_request = requests.get(baseURLa + countries[3] + baseURLb)

nicaragua_data = nicaragua_request.json()

print(nicaragua_data[0])
{'Country': 'Nicaragua', 'CountryCode': '', 'Province': '', 'City': '', 'CityCode': '', 'Lat': '0', 'Lon': '0', 'Cases': 0, 'Status': 'confirmed', 'Date': '2020-01-22T00:00:00Z'}
for day in nicaragua_data:
    query = ("INSERT INTO covid_data.new_table (country, cases, province, date) VALUES (%s, %s, %s, %s)")
    
    day["Date"] = day["Date"].split("T")[0]
    cursor.execute(query, (day["Country"], day["Cases"], day["Province"], day["Date"]))
    connection.commit()
us_request = requests.get(baseURLa + countries[1] + baseURLb)

us_data = us_request.json()

print(us_data[0])
{'Country': 'United States of America', 'CountryCode': '', 'Province': '', 'City': '', 'CityCode': '', 'Lat': '0', 'Lon': '0', 'Cases': 1, 'Status': 'confirmed', 'Date': '2020-01-22T00:00:00Z'}
#for day in us_data:
#    query = ("INSERT INTO covid_data.new_table (country, cases, province, date) VALUES (%s, %s, %s, %s)")
#    day["Date"] = day["Date"].split("T")[0]
    
#    cursor.execute(query, (day["Country"], day["Cases"], day["Province"], day["Date"]))
#    connection.commit()
india_request = requests.get(baseURLa + countries[2] + baseURLb)

india_data = india_request.json()

print(india_data[0])
{'Country': 'India', 'CountryCode': '', 'Province': '', 'City': '', 'CityCode': '', 'Lat': '0', 'Lon': '0', 'Cases': 0, 'Status': 'confirmed', 'Date': '2020-01-22T00:00:00Z'}
#for day in india_data:
#    query = ("INSERT INTO covid_data.new_table (country, cases, province, date) VALUES (%s, %s, %s, %s)")
#    day["Date"] = day["Date"].split("T")[0]
    
#    cursor.execute(query, (day["Country"], day["Cases"], day["Province"], day["Date"]))
#    connection.commit()
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
#get all cases from the database
US_query = ("SELECT * FROM new_table WHERE country='United States of America'")
cursor.execute(US_query)
US_results = cursor.fetchall()

SK_query = ("SELECT * FROM new_table WHERE country='Korea (South)'")
cursor.execute(SK_query)
SK_results = cursor.fetchall()

IN_query = ("SELECT * FROM new_table WHERE country='India'")
cursor.execute(IN_query)
IN_results = cursor.fetchall()

NI_query = ("SELECT * FROM new_table WHERE country='Nicaragua'")
cursor.execute(NI_query)
NI_results = cursor.fetchall()

print(US_results[0])

#cursor.close();

(352, 'United States of America', '', 1, datetime.date(2020, 1, 22))
def getCasesFromData(dataset):
    dates = []
    for d in dataset:
        #print(d)
        dates.append(d[3])
        
    return dates
        
def getDatesFromData(dataset):
    dates = []
    for d in dataset:
        #print(d)
        dates.append(d[4])
        
    return dates
import datetime
from matplotlib.dates import drange

US_cases = getCasesFromData(US_results)
US_dates = getDatesFromData(US_results)

SK_cases = getCasesFromData(SK_results)
SK_dates = getDatesFromData(SK_results)

IN_cases = getCasesFromData(IN_results)
IN_dates = getDatesFromData(IN_results)

NI_cases = getCasesFromData(NI_results)
NI_dates = getDatesFromData(NI_results)

print(len(US_dates), len(SK_dates), len(IN_dates))
print(len(US_cases), len(SK_cases), len(IN_cases))
120 120 120
120 120 120
# Data for plotting

plt.subplots(figsize=(15,6))
plt.grid()
plt.plot(US_dates, US_cases, label="United States")
plt.plot(IN_dates, IN_cases, label="India")
plt.plot(US_dates, SK_cases, label="South Korea")
plt.plot(NI_dates, NI_cases, label="Nicaragua")

plt.legend()

plt.show()

png

#connection.close()