# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress
import json
import datetime
# Import API key
from api_keys import weather_api_key
# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy
# Output File (CSV)
output_data_file = "output_data/cities.csv"

# Range of latitudes and longitudes
lat_range = (-90, 90)
lng_range = (-180, 180)

# Draw 1500 random latitude/longitude pairs, uniformly distributed over both ranges
lats = np.random.uniform(lat_range[0], lat_range[1], size=1500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=1500)
lat_lngs = zip(lats, lngs)

# Parallel lists: each unique city name plus the random coordinates that first produced it
cities = []
saved_lats = []
saved_longs = []
# Set mirror of `cities`, so the uniqueness check is a hash lookup instead of a list scan
seen_cities = set()

# Identify the nearest city for each lat, lng combination
for lat, lng in lat_lngs:
    city = citipy.nearest_city(lat, lng).city_name
    # Only the first occurrence of a city is kept, together with its approximate coordinates
    if city in seen_cities:
        continue
    seen_cities.add(city)
    cities.append(city)
    saved_lats.append(lat)
    saved_longs.append(lng)

# Print the city count to confirm sufficient count
print("{} cities have been identified to poll for weather".format(len(cities)))
city_list = pd.DataFrame({"City":cities, "Approx. Latitude":saved_lats, "Approx. Longitude":saved_longs})

# Save to a csv for reference: the approximate coordinates can later be compared with the
# actual ones. citipy cannot do this easily, because it only maps approximate coordinates to
# a city name, not a city name back to its actual coordinates.
city_list.to_csv("./CityNameList.csv")
city_list.head()
# OpenWeatherMap's newer OneCall API can return historical weather by coordinates instead of
# by city name. It was not implemented for this project initially because its setup differs:
# base_url = "https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&exclude={part}&appid={YOUR API KEY}"
# params = {"exclude":"minutely,hourly,current", "appid":weather_api_key, "lat":city_list["Latitude"][0],
# "lon":city_list["Longitude"][0]}

# Add empty-string placeholder columns, in this order: the four weather measurements,
# the actual coordinates, the country code, and the city's local datetime
for blank_column in (
    "Max Temperature (F)",
    "Humidity (%)",
    "Cloudiness",
    "Wind Speed (mph)",
    "Actual Latitude",
    "Actual Longitude",
    "Country",
    "Datetime (Local)",
):
    city_list[blank_column] = ""

# Seven hours expressed in seconds; the current timezone is noted as PST (-7:00 from GMT)
PST_offset = 7 * 3600
# Single test request for the first city, used to establish how the JSON fields map to columns
base_url_current = "http://api.openweathermap.org/data/2.5/weather"
params = {"q": city_list.at[0, "City"], "units": "imperial", "appid": weather_api_key}
# A timeout keeps a stalled connection from hanging the script indefinitely
test_response = requests.get(base_url_current, params=params, timeout=10)
json_response = test_response.json()
# json.dumps(json_response, indent=4)

# Write each value with .at[row, column]. Chained indexing (df[col][row] = value) assigns
# through an intermediate Series and is not guaranteed to update the DataFrame itself.
city_list.at[0, "Actual Latitude"] = json_response["coord"]["lat"]
city_list.at[0, "Actual Longitude"] = json_response["coord"]["lon"]
city_list.at[0, "Max Temperature (F)"] = json_response["main"]["temp_max"]
city_list.at[0, "Humidity (%)"] = json_response["main"]["humidity"]
city_list.at[0, "Cloudiness"] = json_response["clouds"]["all"]
city_list.at[0, "Wind Speed (mph)"] = json_response["wind"]["speed"]
city_list.at[0, "Country"] = json_response["sys"]["country"]

# Build the local date and time at the city of interest: the response timestamp plus the
# timezone shift given by the API, plus PST_offset to compensate for the computer clock
# being set to PST
first_city_time = datetime.datetime.fromtimestamp(
    json_response["dt"] + json_response["timezone"] + PST_offset
)
city_list.at[0, "Datetime (Local)"] = first_city_time.strftime("%m/%d/%Y, %H:%M:%S")
def getTimeString(resp_json, comp_time):
    """Return the "Datetime (Local)" column string for one weather response.

    The response's "dt" and "timezone" values and `comp_time` (all in seconds)
    are summed, and the total is converted with datetime.fromtimestamp, which
    interprets it in this computer's local timezone. `comp_time` is the
    caller's compensation for that local-clock offset.

    Returns the result formatted as "MM/DD/YYYY, HH:MM:SS".
    """
    shifted_epoch = resp_json["dt"] + resp_json["timezone"] + comp_time
    return datetime.datetime.fromtimestamp(shifted_epoch).strftime("%m/%d/%Y, %H:%M:%S")
def addColumns(pd_df, resp_json, i, comp_time):
    """Copy one weather response into row `i` of `pd_df`.

    Parameters:
        pd_df: DataFrame that already has the columns written below.
        resp_json: decoded JSON response for one city.
        i: index label of the row to fill.
        comp_time: offset in seconds passed through to getTimeString.

    Raises KeyError if the response lacks any expected field. Every field is
    read before anything is written, so a failed call leaves the row untouched.
    """
    row_values = {
        "Actual Latitude": resp_json["coord"]["lat"],
        "Actual Longitude": resp_json["coord"]["lon"],
        "Max Temperature (F)": resp_json["main"]["temp_max"],
        "Humidity (%)": resp_json["main"]["humidity"],
        "Cloudiness": resp_json["clouds"]["all"],
        "Wind Speed (mph)": resp_json["wind"]["speed"],
        "Country": resp_json["sys"]["country"],
        # Stored as a string so the timestamp stays the same if it is examined later
        "Datetime (Local)": getTimeString(resp_json, comp_time),
    }
    # .at[row, column] writes into the DataFrame itself. Chained indexing
    # (pd_df[column][i] = value) assigns through an intermediate Series and is
    # not guaranteed to reach the frame.
    for column, value in row_values.items():
        pd_df.at[i, column] = value
# Loop over the rows of the dataframe to gather the weather for each city.
# Kept in its own cell so it can run separately from the test/setup of the API calls.
record_count = len(city_list)
for i, row in city_list.iterrows():
    # Skip over missing cities or missing info, catching only the errors this step can
    # produce: a network failure or timeout, an undecodable body, or a response without
    # the expected fields. A bare except would also swallow KeyboardInterrupt.
    try:
        params["q"] = row["City"]
        response = requests.get(base_url_current, params=params, timeout=10)
        json_resp = response.json()
        addColumns(city_list, json_resp, i, PST_offset)
    except (requests.RequestException, KeyError, ValueError, TypeError):
        # Mark the row so it can be filtered out later; .at writes into the frame directly
        city_list.at[i, "City"] = "Failed!!!"
        print("Could not retrieve record {} of {} for the city of {}".format(i+1, record_count, row["City"]))
    else:
        print("Retrieved record {} of {} for the city of {}".format(i+1, record_count, row["City"]))
city_list.head()
# Keep only the rows whose retrieval succeeded. The explicit copy makes remaining_cities
# an independent frame, so the column assignments below do not target a slice of city_list.
remaining_cities = city_list.loc[city_list["City"] != "Failed!!!"].copy()
city_left_count = len(remaining_cities)
# Guard the ratio so an empty city list reports 0.0% instead of raising ZeroDivisionError
success_rate = city_left_count / record_count if record_count else 0.0
print("There is data for {} cities out of the {} cities identified, for a retrieval success rate of {:.1%}"
      .format(city_left_count, record_count, success_rate))
# Renumber the rows 0..n-1 now that the failed rows are gone
remaining_cities = remaining_cities.reset_index(drop=True)
# Every column other than the three below must be converted to a numeric type;
# they were created as empty-string placeholders
non_numeric_cols = ["City", "Country", "Datetime (Local)"]
for col in remaining_cities.columns:
    if col not in non_numeric_cols:
        remaining_cities[col] = pd.to_numeric(remaining_cities[col])
remaining_cities.dtypes
remaining_cities
Skip this step if there are no cities that have humidity > 100%.
# Check whether any cities report more than 100% humidity, using the data itself rather
# than a fixed message (the cities are drawn at random, so the maximum changes per run)
test_stats = remaining_cities["Humidity (%)"].describe(include = "all")
max_humidity = remaining_cities["Humidity (%)"].max()
# remaining_cities["Humidity (%)"].value_counts()
# Get the indices of cities that have humidity over 100%.
humid_outlier_idx = remaining_cities.index[remaining_cities["Humidity (%)"] > 100]
if humid_outlier_idx.empty:
    print("The maximum humidity value is {:.0f}%, so there are no values greater than 100% in our data!"
          .format(max_humidity))
else:
    # Drop all humidity outliers by index and renumber the rows, then refresh the
    # city count so later percentages are based on the cleaned data
    print("Dropping {} cities with humidity greater than 100% (maximum found: {:.0f}%)"
          .format(len(humid_outlier_idx), max_humidity))
    remaining_cities = remaining_cities.drop(index=humid_outlier_idx).reset_index(drop=True)
    city_left_count = len(remaining_cities)
test_stats
# Export the City_Data into a csv
remaining_cities.to_csv("../output_data/Retrieved_City_Weather_Data.csv")
remaining_cities
# Scatter plot of maximum temperature against latitude for every retrieved city, saved as a PNG
lat_vs_T_axes = remaining_cities.plot.scatter(
    x="Actual Latitude",
    y="Max Temperature (F)",
    title="Maximum Temperature (F) vs. City Latitude",
)
lat_vs_T_axes.get_figure().savefig("../output_data/Temp_vs_Latitude_All_Cities.png")
The Temperature (F) vs. Latitude plot shown above, which includes data from all cities, peaks at approximately 20 degrees latitude. Temperatures at latitudes below that peak show a possible linear relationship with latitude, and temperatures at latitudes above it also show a possible linear relationship, with both trends meeting at the same peak temperature and latitude.
# Scatter plot of humidity against latitude for every retrieved city, saved as a PNG
lat_vs_humidity_axes = remaining_cities.plot.scatter(
    x="Actual Latitude",
    y="Humidity (%)",
    title="Humidity (%) vs. City Latitude",
)
lat_vs_humidity_axes.get_figure().savefig("../output_data/Humidity_vs_Latitude_All_Cities.png")
The Humidity vs. Latitude plot shown above, which contains data from all cities, shows a relatively spread-out distribution of data points. There does not appear to be a relationship between humidity and latitude at first glance, but the humidity percentages seem to largely cluster above 50%.
# Scatter plot of cloudiness against latitude for every retrieved city, saved as a PNG
lat_vs_cloudiness_axes = remaining_cities.plot.scatter(
    x="Actual Latitude",
    y="Cloudiness",
    title="Cloudiness vs. City Latitude",
)
lat_vs_cloudiness_axes.get_figure().savefig("../output_data/Cloudiness_vs_Latitude_All_Cities.png")
The Cloudiness vs. Latitude plot shown above, containing data from all cities, does not seem to show a relationship between cloudiness and city latitude. There seem to be some values for cloudiness that the points cluster along, noticeably 0, 20, 40, 75, 90, and 100, so those values may be rounded or categorized from some of the sources, and not represent continuous variables or measurements.
# Scatter plot of wind speed against latitude for every retrieved city, saved as a PNG
lat_vs_wind_speed_axes = remaining_cities.plot.scatter(
    x="Actual Latitude",
    y="Wind Speed (mph)",
    title="Wind Speed (mph) vs. City Latitude",
)
lat_vs_wind_speed_axes.get_figure().savefig("../output_data/Wind_Speed_vs_Latitude_All_Cities.png")
The Wind Speed vs. Latitude plot, shown above for all cities, does not seem to show any relationship between wind speed and latitude. Most of the wind speed values are below about 25 mph, with only a few values above that speed.
# OPTIONAL: Create a function to create Linear Regression plots
def makeLinRegression(df, x_Col, y_Col, date_label=None):
    """Scatter-plot df[y_Col] against df[x_Col] with its best-fit regression line.

    Parameters:
        df: DataFrame holding both columns.
        x_Col: String column name plotted on the x-axis.
        y_Col: String column name plotted on the y-axis.
        date_label: text shown in parentheses in the plot title. When omitted,
            the date part of the first "Datetime (Local)" entry of the
            module-level remaining_cities DataFrame is used.

    Prints the equation of the best-fit line and its correlation coefficient.

    Returns the axes object of the plot, so more changes can be made later
    if necessary.
    """
    x_vals = df[x_Col]
    y_vals = df[y_Col]

    # First, perform the linear regression on the data
    regr_slope, regr_intercept, regr_r_val, _, _ = linregress(x_vals, y_vals)

    # A straight line only needs its two endpoints. Using the exact minimum and maximum
    # makes the line span the whole data range; integer-truncated bounds fall short of it
    # and collapse to a single point when the range is narrower than one unit.
    regr_line_x = np.array([x_vals.min(), x_vals.max()])
    regr_line_y = regr_slope * regr_line_x + regr_intercept

    if date_label is None:
        date_label = remaining_cities["Datetime (Local)"].iloc[0].split(",")[0]

    # Plot the scatterplot with the raw data first
    df_axes = df.plot(kind="scatter", x=x_Col, y=y_Col,
                      title="{} vs. {} ({})".format(y_Col, x_Col, date_label))
    # Add the best-fit line to the plot as a solid red line
    df_axes.plot(regr_line_x, regr_line_y, 'r')

    # Output best-fit line information
    print("The equation of the best-fit linear regression line for this chart is y={:.2f}x+{:.2f}"
          .format(regr_slope, regr_intercept))
    # linregress returns the correlation coefficient r, not r-squared, so label it as
    # such and report the squared value separately
    print("The r-value (correlation coefficient) for this line is {:.2f} (r-squared = {:.2f})"
          .format(regr_r_val, regr_r_val ** 2))

    # Return the axes object for the plot, in case any changes need to be made
    return df_axes
# Create Northern and Southern Hemisphere DataFrames

# Northern Hemisphere: every city on or north of the equator
northern_hemisphere = remaining_cities[remaining_cities["Actual Latitude"].ge(0)]
northern_cities_count = northern_hemisphere.shape[0]
northern_summary = "There are {} cities in the northern hemisphere out of {} cities total, representing {:.1%} of all cities in the data set."
print(northern_summary.format(northern_cities_count, city_left_count, northern_cities_count / city_left_count))

# Southern Hemisphere: every city strictly south of the equator
southern_hemisphere = remaining_cities[remaining_cities["Actual Latitude"].lt(0)]
southern_cities_count = southern_hemisphere.shape[0]
southern_summary = "There are {} cities in the southern hemisphere out of {} cities total, representing {:.1%} of all cities in the data set."
print(southern_summary.format(southern_cities_count, city_left_count, southern_cities_count / city_left_count))
# Northern hemisphere: regress max temperature on latitude with makeLinRegression, the same
# helper the other hemisphere plots use, instead of repeating its steps by hand with a
# hard-coded 0-79 range for the regression line. The helper produces the same title,
# draws the red best-fit line and prints the fit information.
n_temp_axes = makeLinRegression(northern_hemisphere, "Actual Latitude", "Max Temperature (F)")
n_temp_axes.get_figure().savefig("../output_data/Temp_vs_Latitude_N_Hemisphere.png")
The Max Temperature vs. Latitude plot for cities in the northern hemisphere is shown above. The max temperature appears to show a strong negative linear relationship with latitude in the northern hemisphere, with an r-value of -0.72 for the linear best-fit regression line.
# Southern hemisphere: regress max temperature on latitude, then save the figure
s_temp_lat_axes = makeLinRegression(
    df=southern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Max Temperature (F)",
)
s_temp_lat_axes.get_figure().savefig("../output_data/Temp_vs_Latitude_S_Hemisphere.png")
The Max Temperature vs. Latitude plot for cities in the southern hemisphere is shown above. The max temperature appears to show a strong positive linear relationship with latitude in the southern hemisphere, with an r-value of 0.84 for the linear best-fit regression line.
# Northern hemisphere: regress humidity on latitude, then save the figure
n_humidity_lat_axes = makeLinRegression(
    df=northern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Humidity (%)",
)
n_humidity_lat_axes.get_figure().savefig("../output_data/Humidity_vs_Latitude_N_Hemisphere.png")
The Humidity vs. Latitude plot for cities in the northern hemisphere is shown above. The data seems to show no relationship between the humidity and latitude for the northern hemisphere.
# Southern hemisphere: regress humidity on latitude, then save the figure
s_humidity_lat_axes = makeLinRegression(
    df=southern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Humidity (%)",
)
s_humidity_lat_axes.get_figure().savefig("../output_data/Humidity_vs_Latitude_S_Hemisphere.png")
The Humidity vs. Latitude plot for cities in the southern hemisphere is shown above. The data seems to show no relationship between the humidity and latitude for the southern hemisphere.
# Northern hemisphere: regress cloudiness on latitude, then save the figure
n_cloudiness_lat_axes = makeLinRegression(
    df=northern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Cloudiness",
)
n_cloudiness_lat_axes.get_figure().savefig("../output_data/Cloudiness_vs_Latitude_N_Hemisphere.png")
The Cloudiness vs. Latitude plot for cities in the northern hemisphere is shown above. The data seems to show no relationship between cloudiness and latitude for the northern hemisphere.
# Southern hemisphere: regress cloudiness on latitude, then save the figure
s_cloudiness_lat_axes = makeLinRegression(
    df=southern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Cloudiness",
)
s_cloudiness_lat_axes.get_figure().savefig("../output_data/Cloudiness_vs_Latitude_S_Hemisphere.png")
The Cloudiness vs. Latitude plot for cities in the southern hemisphere is shown above. The data seems to show no relationship between cloudiness and latitude for the southern hemisphere.
# Northern hemisphere: regress wind speed on latitude, then save the figure
n_wind_lat_axes = makeLinRegression(
    df=northern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Wind Speed (mph)",
)
n_wind_lat_axes.get_figure().savefig("../output_data/Wind_Speed_vs_Latitude_N_Hemisphere.png")
The Wind Speed vs. Latitude plot for cities in the northern hemisphere is shown above. The data seems to show no relationship between wind speed and latitude for the northern hemisphere.
# Southern hemisphere: regress wind speed on latitude, then save the figure
s_wind_lat_axes = makeLinRegression(
    df=southern_hemisphere,
    x_Col="Actual Latitude",
    y_Col="Wind Speed (mph)",
)
s_wind_lat_axes.get_figure().savefig("../output_data/Wind_Speed_vs_Latitude_S_Hemisphere.png")
The Wind Speed vs. Latitude plot for cities in the southern hemisphere is shown above. The data seems to show no relationship between wind speed and latitude for the southern hemisphere.