Boost your business SQL skills. Whether you’re in marketing, finance, or product, knowing how to make data-driven decisions is the key to success. The more fluently you can retrieve and analyze your data, the quicker you’ll uncover actionable insights and grow your business. In this track, you’ll learn how to quickly explore and analyze data to help you make smarter business decisions. Through hands-on practice, you’ll learn everything from creating and joining tables to writing queries, subqueries, and aggregate functions, providing you with the skills you need to excel and overcome real-world business challenges.
Table of Contents
- Exploratory Data Analysis in SQL
- Data-Driven Decision Making in SQL
- Applying SQL to Real-World Problems
- Analyzing Business Data in SQL
- Reporting in SQL
1. Exploratory Data Analysis in SQL
What’s in the database?
SELECT count(*) - count(ticker) AS missing
FROM fortune500;
SELECT company.name
-- Table(s) to select from
FROM company
INNER JOIN fortune500
ON company.ticker=fortune500.ticker;
SELECT *
FROM price;
| column_1 | column_2 |
|---|---|
| 10 | |
| 22 | |
| 3 | 4 |
SELECT coalesce(column_1, column_2)
FROM price;
| coalesce |
|---|
| 10 |
| 22 |
| 3 |
-- Count the number of tags with each type
SELECT type, count(*) AS count
FROM tag_type
-- To get the count for each type, what do you need to do?
GROUP BY type
-- Order the results with the most common
-- tag types listed first
ORDER BY count DESC;
-- Select the 3 columns desired
SELECT name, tag_type.tag, tag_type.type
FROM company
-- Join the tag_company and company tables
INNER JOIN tag_company
ON company.id = tag_company.company_id
-- Join the tag_type and company tables
INNER JOIN tag_type
ON tag_company.tag = tag_type.tag
-- Filter to most common type
WHERE type='cloud';
-- Select the 3 columns desired
-- Use coalesce
SELECT coalesce(fortune500.industry, fortune500.sector, 'Unknown') AS industry2,
-- Don't forget to count!
count(*)
FROM fortune500
-- Group by what? (What are you counting by?)
GROUP BY industry2
-- Order results to see most common first
ORDER BY count DESC
-- Limit results to get just the one value you want
LIMIT 1;
SELECT company_original.name, title, rank
-- Start with original company information
FROM company AS company_original
-- Join to another copy of company with parent
-- company information
LEFT JOIN company AS company_parent
ON company_original.parent_id = company_parent.id
-- Join to fortune500, only keep rows that match
INNER JOIN fortune500
-- Use parent ticker if there is one,
-- otherwise original ticker
ON coalesce(company_original.ticker,
company_parent.ticker) =
fortune500.ticker
-- For clarity, order by rank
ORDER BY rank;
-- Select the original value
SELECT profits_change,
-- Cast profits_change
CAST(profits_change AS integer) AS profits_change_int
FROM fortune500;
- The first column will be 3 (integer division truncates); the second will be 3.3333… (casting to numeric gives decimal division).
-- Divide 10 by 3
SELECT 10/3,
-- Cast 10 as numeric and divide by 3
10::numeric/3;
-- Select the count of each revenues_change integer value
SELECT revenues_change::integer, COUNT(revenues_change::integer)
FROM fortune500
GROUP BY revenues_change::integer
-- order by the values of revenues_change
ORDER BY revenues_change;
Summarizing and aggregating numeric data
-- Bins created in Step 2
WITH bins AS (
SELECT generate_series(2200, 3050, 50) AS lower,
generate_series(2250, 3100, 50) AS upper),
-- Subset stackoverflow to just tag dropbox (Step 1)
dropbox AS (
SELECT question_count
FROM stackoverflow
WHERE tag='dropbox')
-- Select columns for result
-- What column are you counting to summarize?
SELECT lower, upper, count(question_count)
FROM bins -- Created above
-- Join to dropbox (created above),
-- keeping all rows from the bins table in the join
LEFT JOIN dropbox
-- Compare question_count to lower and upper
ON question_count >= lower
AND question_count < upper
-- Group by lower and upper to count values in each bin
GROUP BY lower, upper
-- Order by lower to put bins in order
ORDER BY lower;
-- What groups are you computing statistics by?
SELECT sector,
-- Select the mean of assets with the avg function
AVG(assets) AS mean,
-- Select the median
percentile_disc(0.5) WITHIN GROUP (ORDER BY assets) AS median
FROM fortune500
-- Computing statistics for each what?
GROUP BY sector
-- Order results by a value of interest
ORDER BY mean;
-- Code from previous step
DROP TABLE IF EXISTS profit80;
CREATE TEMP TABLE profit80 AS
SELECT sector,
percentile_disc(0.8) WITHIN GROUP (ORDER BY profits) AS pct80
FROM fortune500
GROUP BY sector;
-- Select columns, aliasing as needed
SELECT title, fortune500.sector,
profits, profits/pct80 AS ratio
-- What tables do you need to join?
FROM fortune500
LEFT JOIN profit80
-- How are the tables joined?
ON fortune500.sector=profit80.sector
-- What rows do you want to select?
WHERE profits > pct80;
-- To clear table if it already exists
DROP TABLE IF EXISTS startdates;
CREATE TEMP TABLE startdates AS
SELECT tag, min(date) AS mindate
FROM stackoverflow
GROUP BY tag;
-- Select tag (Remember the table name!) and mindate
SELECT startdates.tag,
mindate,
-- Select question count on the min and max days
so_min.question_count AS min_date_question_count,
so_max.question_count AS max_date_question_count,
-- Compute the change in question_count (max- min)
so_max.question_count - so_min.question_count AS change
FROM startdates
-- Join startdates to stackoverflow with alias so_min
INNER JOIN stackoverflow AS so_min
-- What needs to match between tables?
ON startdates.tag = so_min.tag
AND startdates.mindate = so_min.date
-- Join to stackoverflow again with alias so_max
INNER JOIN stackoverflow AS so_max
-- Again, what needs to match between tables?
ON startdates.tag = so_max.tag
AND so_max.date = '2018-09-25';
DROP TABLE IF EXISTS correlations;
CREATE TEMP TABLE correlations AS
SELECT 'profits'::varchar AS measure,
corr(profits, profits) AS profits,
corr(profits, profits_change) AS profits_change,
corr(profits, revenues_change) AS revenues_change
FROM fortune500;
INSERT INTO correlations
SELECT 'profits_change'::varchar AS measure,
corr(profits_change, profits) AS profits,
corr(profits_change, profits_change) AS profits_change,
corr(profits_change, revenues_change) AS revenues_change
FROM fortune500;
INSERT INTO correlations
SELECT 'revenues_change'::varchar AS measure,
corr(revenues_change, profits) AS profits,
corr(revenues_change, profits_change) AS profits_change,
corr(revenues_change, revenues_change) AS revenues_change
FROM fortune500;
-- Select each column, rounding the correlations
SELECT measure,
round(profits::numeric, 2) AS profits,
round(profits_change::numeric, 2) AS profits_change,
round(revenues_change::numeric, 2) AS revenues_change
FROM correlations;
Exploring categorical data and unstructured text
-- Select the first 50 chars when length is greater than 50
SELECT CASE WHEN length(description) > 50
THEN left(description, 50) || '...'
-- otherwise just select description
ELSE description
END
FROM evanston311
-- limit to descriptions that start with the word I
WHERE description LIKE 'I %'
ORDER BY description;
-- Code from previous step
DROP TABLE IF EXISTS recode;
CREATE TEMP TABLE recode AS
SELECT DISTINCT category,
rtrim(split_part(category, '-', 1)) AS standardized
FROM evanston311;
UPDATE recode SET standardized='Trash Cart'
WHERE standardized LIKE 'Trash%Cart';
UPDATE recode SET standardized='Snow Removal'
WHERE standardized LIKE 'Snow%Removal%';
UPDATE recode SET standardized='UNUSED'
WHERE standardized IN ('THIS REQUEST IS INACTIVE...Trash Cart',
'(DO NOT USE) Water Bill',
'DO NOT USE Trash', 'NO LONGER IN USE');
-- Select the recoded categories and the count of each
SELECT standardized, COUNT(*) AS count
-- From the original table and table with recoded values
FROM evanston311
LEFT JOIN recode
-- What column do they have in common?
ON evanston311.category = recode.category
-- What do you need to group by to count?
GROUP BY standardized
-- Display the most common values first
ORDER BY count DESC;
Create and store indicator variables for email and phone in a temporary table. LIKE
produces a boolean (true/false), and casting a boolean as an integer converts true
to 1 and false to 0. This makes the values easier to summarize later.
-- To clear table if it already exists
DROP TABLE IF EXISTS indicators;
-- Create the temp table
CREATE TEMP TABLE indicators AS
SELECT id,
CAST (description LIKE '%@%' AS integer) AS email,
CAST (description LIKE '%___-___-____%' AS integer) AS phone
FROM evanston311;
-- Select the column you'll group by
SELECT priority,
-- Compute the proportion of rows with each indicator
SUM(email)/COUNT(*)::numeric AS email_prop,
SUM(phone)/COUNT(*)::numeric AS phone_prop
-- Tables to select from
FROM evanston311
LEFT JOIN indicators
-- Joining condition
ON evanston311.id=indicators.id
-- What are you grouping by?
GROUP BY priority;
Working with dates and timestamps
-- Select name of the day of the week the request was created
SELECT to_char(date_created, 'day') AS day,
-- Select avg time between request creation and completion
avg(date_completed - date_created) AS duration
FROM evanston311
-- Group by the name of the day of the week and
-- integer value of day of week the request was created
GROUP BY day, EXTRACT(DOW FROM date_created)
-- Order by integer value of the day of the week
-- the request was created
ORDER BY EXTRACT(DOW FROM date_created);
-- Aggregate daily counts by month
SELECT date_trunc('month', day) AS month,
avg(count)
-- Subquery to compute daily counts
FROM (SELECT date_trunc('day', date_created) AS day,
count(*) AS count
FROM evanston311
GROUP BY day) AS daily_count
GROUP BY month
ORDER BY month;
-- Compute monthly counts of requests created
WITH created AS (
SELECT date_trunc('month', date_created) AS month,
count(*) AS created_count
FROM evanston311
WHERE category='Rodents- Rats'
GROUP BY month),
-- Compute monthly counts of requests completed
completed AS (
SELECT date_trunc('month', date_completed) AS month,
count(*) AS completed_count
FROM evanston311
WHERE category='Rodents- Rats'
GROUP BY month)
-- Join monthly created and completed counts
SELECT created.month,
created_count,
completed_count
FROM created
INNER JOIN completed
ON created.month=completed.month
ORDER BY created.month;
2. Data-Driven Decision Making in SQL
Introduction to business intelligence for an online movie rental database
SELECT *
FROM movies
WHERE genre <> 'Drama'; -- All genres except drama
Decision Making with simple SQL queries
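No query was saved under this chapter, so here is a minimal sketch of the kind of simple decision-making query it covers, assuming the course's renting table with customer_id and rating columns:
-- Average rating and number of ratings per customer
SELECT customer_id,
AVG(rating) AS avg_rating,
COUNT(rating) AS number_ratings
FROM renting
GROUP BY customer_id
-- Keep only customers with more than 3 ratings
HAVING COUNT(rating) > 3
ORDER BY avg_rating DESC;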
Data Driven Decision Making with advanced SQL queries
SELECT *
FROM customers as c -- Select all customers with at least one rating
WHERE EXISTS
(SELECT *
FROM renting AS r
WHERE rating IS NOT NULL
AND r.customer_id = c.customer_id);
Data Driven Decision Making with OLAP SQL queries
SELECT genre,
year_of_release,
COUNT(*)
FROM movies
GROUP BY CUBE (genre, year_of_release)
ORDER BY year_of_release;
-- Group by each country and genre with OLAP extension
SELECT
c.country,
m.genre,
AVG(r.rating) AS avg_rating,
COUNT(*) AS num_rating
FROM renting AS r
LEFT JOIN movies AS m
ON m.movie_id = r.movie_id
LEFT JOIN customers AS c
ON r.customer_id = c.customer_id
GROUP BY ROLLUP (c.country, m.genre)
ORDER BY c.country, m.genre;
SELECT
c.country,
c.gender,
AVG(r.rating)
FROM renting AS r
LEFT JOIN customers AS c
ON r.customer_id = c.customer_id
-- Report all info from a Pivot table for country and gender
GROUP BY GROUPING SETS ((country, gender), (country), (gender), ());
3. Applying SQL to Real-World Problems
Use Real-World SQL
EXTRACT(<part> FROM <date_column>)
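For example, a sketch assuming the DVD rental database's payment table with a payment_date timestamp:
-- Count payments per calendar month extracted from the timestamp
SELECT EXTRACT(MONTH FROM payment_date) AS month,
COUNT(*) AS payments
FROM payment
GROUP BY month
ORDER BY month;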
- Use STRING_AGG with GROUP BY to collapse many rows into a single comma-separated string per group:
SELECT name,
STRING_AGG(title, ',') AS film_titles
FROM film AS f
INNER JOIN language AS l
ON f.language_id = l.language_id
WHERE release_year = 2010
AND rating = 'G'
GROUP BY name;
Find Your Data
What tables are in my database?
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname = 'public';
A VIEW is a virtual table.
-- Create a new view called table_columns
CREATE VIEW table_columns AS
SELECT table_name,
STRING_AGG(column_name, ', ') AS columns
FROM information_schema.columns
WHERE table_schema = 'public'
GROUP BY table_name;
-- Query the newly created view table_columns
SELECT *
FROM table_columns;
Manage Your Data
Create a TABLE using new data
-- Create a new table called oscars
CREATE TABLE oscars (
title VARCHAR,
award VARCHAR
);
-- Insert the data into the oscars table
INSERT INTO oscars (title, award)
VALUES
('TRANSLATION SUMMER', 'Best Film'),
('DORADO NOTTING', 'Best Film'),
('MARS ROMAN', 'Best Film'),
('CUPBOARD SINNERS', 'Best Film'),
('LONELY ELEPHANT', 'Best Film');
-- Confirm the table was created and is populated
SELECT *
FROM oscars;
Create a TABLE using existing data
-- Create a new table named family_films using this query
CREATE TABLE family_films AS
SELECT *
FROM film
WHERE rating IN ('G', 'PG');
Be careful when modifying tables
UPDATE film
SET rental_rate = rental_rate-1
WHERE film_id IN
(SELECT film_id FROM actor AS a
INNER JOIN film_actor AS f
ON a.actor_id = f.actor_id
WHERE last_name IN ('WILLIS', 'CHASE', 'WINSLET', 'GUINESS', 'HUDSON'));
-- Use the list of film_id values to DELETE all R & NC-17 rated films from inventory.
DELETE FROM inventory
WHERE film_id IN (
SELECT film_id FROM film
WHERE rating IN ('R', 'NC-17')
);
-- Delete records from the `film` table that are either rated as R or NC-17.
DELETE FROM film
WHERE rating IN ('R', 'NC-17');
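A safety habit worth pairing with statements like these: wrap destructive changes in a transaction and inspect the result before committing. This is a general PostgreSQL pattern, not part of the course solution:
BEGIN;
DELETE FROM film
WHERE rating IN ('R', 'NC-17');
-- Verify: this should now return 0
SELECT COUNT(*) FROM film WHERE rating IN ('R', 'NC-17');
-- Run COMMIT; to keep the change, or undo it:
ROLLBACK;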
Best Practices for Writing SQL
Ensure that the SQL scripts you write are easy to understand!
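For instance, capitalized keywords, meaningful aliases, and one clause per line all help (a stylistic sketch, not a prescribed standard):
-- Readable: consistent capitalization, clear alias, one clause per line
SELECT f.title,
f.rental_rate
FROM film AS f
WHERE f.rating = 'G'
ORDER BY f.rental_rate DESC;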
4. Analyzing Business Data in SQL
Revenue, Cost, and Profit
DATE_TRUNC(date_part, date)
DATE_TRUNC('week', '2018-06-12'::DATE)::DATE → '2018-06-11'
DATE_TRUNC('month', '2018-06-12'::DATE)::DATE → '2018-06-01'
SELECT DATE_TRUNC('week', order_date) :: DATE AS delivr_week,
-- Calculate revenue
SUM(meal_price * order_quantity) AS revenue
FROM meals
JOIN orders ON meals.meal_id = orders.meal_id
-- Keep only the records in June 2018
WHERE DATE_TRUNC('month', order_date) = '2018-06-01'
GROUP BY delivr_week
ORDER BY delivr_week ASC;
-- Declare a CTE named monthly_cost
WITH monthly_cost AS (
SELECT
DATE_TRUNC('month', stocking_date)::DATE AS delivr_month,
SUM(meal_cost * stocked_quantity) AS cost
FROM meals
JOIN stock ON meals.meal_id = stock.meal_id
GROUP BY delivr_month)
SELECT
-- Calculate the average monthly cost before September
AVG(cost)
FROM monthly_cost
WHERE delivr_month < '2018-09-01';
WITH revenue AS (
-- Calculate revenue per eatery
SELECT eatery,
SUM(orders.order_quantity * meal_price) AS revenue
FROM meals
JOIN orders ON meals.meal_id = orders.meal_id
GROUP BY eatery),
cost AS (
-- Calculate cost per eatery
SELECT eatery,
SUM(stocked_quantity * meal_cost) AS cost
FROM meals
JOIN stock ON meals.meal_id = stock.meal_id
GROUP BY eatery)
-- Calculate profit per eatery
SELECT revenue.eatery,
(revenue - cost) AS profit
FROM revenue
JOIN cost ON revenue.eatery = cost.eatery
ORDER BY profit DESC;
-- Set up the revenue CTE
WITH revenue AS (
SELECT
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
SUM(meals.meal_price * orders.order_quantity) AS revenue
FROM meals
JOIN orders ON meals.meal_id = orders.meal_id
GROUP BY delivr_month),
-- Set up the cost CTE
cost AS (
SELECT
DATE_TRUNC('month', stocking_date) :: DATE AS delivr_month,
SUM(meals.meal_cost * stock.stocked_quantity) AS cost
FROM meals
JOIN stock ON meals.meal_id = stock.meal_id
GROUP BY delivr_month)
-- Calculate profit by joining the CTEs
SELECT
revenue.delivr_month,
(revenue - cost) AS profit
FROM revenue
JOIN cost ON revenue.delivr_month = cost.delivr_month
ORDER BY revenue.delivr_month ASC;
User-Centric KPIs
Monthly active users (MAU)
SELECT
-- Truncate the order date to the nearest month
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
-- Count the unique user IDs
COUNT(DISTINCT user_id) AS mau
FROM orders
GROUP BY delivr_month
-- Order by month
ORDER BY delivr_month ASC;
WITH mau AS (
SELECT
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
COUNT(DISTINCT user_id) AS mau
FROM orders
GROUP BY delivr_month)
SELECT
-- Select the month and the MAU
delivr_month,
mau,
COALESCE(
LAG(mau) OVER (ORDER BY delivr_month ASC),
0) AS last_mau
FROM mau
-- Order by month in ascending order
ORDER BY delivr_month ASC;
Growth rate
WITH orders AS (
SELECT
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
-- Count the unique order IDs
COUNT(DISTINCT order_id) AS orders
FROM orders
GROUP BY delivr_month),
orders_with_lag AS (
SELECT
delivr_month,
-- Fetch each month's current and previous orders
orders,
COALESCE(
LAG(orders) OVER (ORDER BY delivr_month ASC),
1) AS last_orders
FROM orders)
SELECT
delivr_month,
-- Calculate the MoM order growth rate
ROUND(
(orders - last_orders) :: numeric / last_orders,
2) AS growth
FROM orders_with_lag
ORDER BY delivr_month ASC;
Retention rate
WITH user_monthly_activity AS (
SELECT DISTINCT
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
user_id
FROM orders)
SELECT
-- Calculate the MoM retention rates
previous.delivr_month,
ROUND(
COUNT(DISTINCT current.user_id) :: NUMERIC /
GREATEST(COUNT(DISTINCT previous.user_id), 1),
2) AS retention_rate
FROM user_monthly_activity AS previous
LEFT JOIN user_monthly_activity AS current
-- Fill in the user and month join conditions
ON previous.user_id = current.user_id
AND previous.delivr_month = (current.delivr_month - INTERVAL '1 month')
GROUP BY previous.delivr_month
ORDER BY previous.delivr_month ASC;
ARPU, Histograms, and Percentiles
ARPU
WITH kpi AS (
SELECT
-- Select the week, revenue, and count of users
DATE_TRUNC('week', o.order_date) :: DATE AS delivr_week,
SUM(m.meal_price * o.order_quantity) AS revenue,
COUNT(DISTINCT o.user_id) AS users
FROM meals AS m
JOIN orders AS o ON m.meal_id = o.meal_id
GROUP BY delivr_week)
SELECT
delivr_week,
-- Calculate ARPU
ROUND(
revenue :: NUMERIC / GREATEST(users,1),
2) AS arpu
FROM kpi
-- Order by week in ascending order
ORDER BY delivr_week ASC;
Histogram
WITH user_revenues AS (
SELECT
-- Select the user ID and revenue
user_id,
SUM(o.order_quantity * m.meal_price) AS revenue
FROM meals AS m
JOIN orders AS o ON m.meal_id = o.meal_id
GROUP BY user_id)
SELECT
-- Return the frequency table of revenues by user
ROUND(revenue :: NUMERIC, -2) AS revenue_100,
COUNT(DISTINCT user_id) AS users
FROM user_revenues
GROUP BY revenue_100
ORDER BY revenue_100 ASC;
Bucketing users
WITH user_revenues AS (
SELECT
-- Select the user IDs and the revenues they generate
o.user_id,
SUM(m.meal_price * o.order_quantity) AS revenue
FROM meals AS m
JOIN orders AS o ON m.meal_id = o.meal_id
GROUP BY user_id)
SELECT
-- Fill in the bucketing conditions
CASE
WHEN revenue < 150 THEN 'Low-revenue users'
WHEN revenue < 300 THEN 'Mid-revenue users'
ELSE 'High-revenue users'
END AS revenue_group,
COUNT(DISTINCT user_id) AS users
FROM user_revenues
GROUP BY revenue_group;
Percentiles
WITH user_revenues AS (
-- Select the user IDs and their revenues
SELECT
user_id,
SUM(o.order_quantity*m.meal_price) AS revenue
FROM meals AS m
JOIN orders AS o ON m.meal_id = o.meal_id
GROUP BY user_id)
SELECT
-- Calculate the first, second, and third quartile
ROUND(
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY revenue ASC):: NUMERIC,
2) AS revenue_p25,
ROUND(
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue ASC):: NUMERIC,
2) AS revenue_p50,
ROUND(
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY revenue ASC):: NUMERIC,
2) AS revenue_p75,
-- Calculate the average
ROUND(AVG(revenue) :: NUMERIC, 2) AS avg_revenue
FROM user_revenues;
Generating an Executive Report
Formatting dates
SELECT DISTINCT
-- Select the order date
order_date,
-- Format the order date
TO_CHAR(order_date, 'FMDay DD, FMMonth YYYY') AS format_order_date
FROM orders
ORDER BY order_date ASC
LIMIT 3;
CREATE EXTENSION is like import in Python.
-- Import tablefunc
CREATE EXTENSION IF NOT EXISTS tablefunc;
SELECT * FROM CROSSTAB($$
SELECT
user_id,
DATE_TRUNC('month', order_date) :: DATE AS delivr_month,
SUM(meal_price * order_quantity) :: FLOAT AS revenue
FROM meals
JOIN orders ON meals.meal_id = orders.meal_id
WHERE user_id IN (0, 1, 2, 3, 4)
AND order_date < '2018-09-01'
GROUP BY user_id, delivr_month
ORDER BY user_id, delivr_month;
$$)
-- Select user ID and the months from June to August 2018
AS ct (user_id INT,
"2018-06-01" FLOAT,
"2018-07-01" FLOAT,
"2018-08-01" FLOAT)
ORDER BY user_id ASC;
-- Import tablefunc
CREATE EXTENSION IF NOT EXISTS tablefunc;
-- Pivot the previous query by quarter
SELECT * FROM CROSSTAB($$
WITH eatery_users AS (
SELECT
eatery,
-- Format the order date so "2018-06-01" becomes "Q2 2018"
TO_CHAR(order_date, '"Q"Q YYYY') AS delivr_quarter,
-- Count unique users
COUNT(DISTINCT user_id) AS users
FROM meals
JOIN orders ON meals.meal_id = orders.meal_id
GROUP BY eatery, delivr_quarter
ORDER BY delivr_quarter, users)
SELECT
-- Select eatery and quarter
eatery,
delivr_quarter,
-- Rank rows, partition by quarter and order by users
RANK() OVER
(PARTITION BY delivr_quarter
ORDER BY users DESC) :: INT AS users_rank
FROM eatery_users
ORDER BY eatery, delivr_quarter;
$$)
-- Select the columns of the pivoted table
AS ct (eatery TEXT,
"Q2 2018" INT,
"Q3 2018" INT,
"Q4 2018" INT)
ORDER BY "Q4 2018";
5. Reporting in SQL
Exploring the Olympics Dataset
- An E-R diagram visually shows all tables, fields, and relationships in a database; when one isn't available, you can query the catalog instead (sketch below).
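You can approximate an E-R overview by listing foreign-key relationships from the standard information_schema views. A sketch (joining on constraint_name alone, which is sufficient for a single-schema database like this one):
-- List each foreign key: child table/column -> parent table/column
SELECT tc.table_name AS child_table,
kcu.column_name AS child_column,
ccu.table_name AS parent_table,
ccu.column_name AS parent_column
FROM information_schema.table_constraints AS tc
JOIN information_schema.key_column_usage AS kcu
ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage AS ccu
ON tc.constraint_name = ccu.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY'
ORDER BY child_table;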
Creating Reports
No notes were saved for this chapter.
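As a placeholder, here is a minimal sketch of the kind of report query the chapter builds, using the summer_games and countries tables that appear in the exercises below:
-- Gold medal counts by region: a simple two-table report
SELECT c.region,
SUM(COALESCE(s.gold, 0)) AS gold_medals
FROM summer_games AS s
JOIN countries AS c
ON s.country_id = c.id
GROUP BY c.region
ORDER BY gold_medals DESC;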
Cleaning & Validation
Identifying data types
-- Pull column_name & data_type from the columns table
SELECT
column_name,
data_type
FROM information_schema.columns
-- Filter for the table 'country_stats'
WHERE table_name = 'country_stats';
SELECT
-- Clean the country field to only show country_code
LEFT(REPLACE(UPPER(TRIM(c.country)), '.', ''), 3) AS country_code,
-- Pull in pop_in_millions and medals_per_million
pop_in_millions,
-- Add the three medal fields using one sum function
SUM(COALESCE(bronze,0) + COALESCE(silver,0) + COALESCE(gold,0)) AS medals,
SUM(COALESCE(bronze,0) + COALESCE(silver,0) + COALESCE(gold,0)) / CAST(cs.pop_in_millions AS float) AS medals_per_million
FROM summer_games AS s
JOIN countries AS c
ON s.country_id = c.id
-- Update the newest join statement to remove duplication
JOIN country_stats AS cs
ON s.country_id = cs.country_id AND s.year = CAST(cs.year AS date)
-- Filter out null populations
WHERE cs.pop_in_millions IS NOT NULL
GROUP BY c.country, pop_in_millions
-- Keep only the top 25 medals_per_million rows
ORDER BY medals_per_million DESC
LIMIT 25;
Complex Calculations
-- Query region, athlete name, and total_golds
SELECT
region,
athlete_name,
total_golds
FROM
(SELECT
-- Query region, athlete_name, and total gold medals
region,
name AS athlete_name,
SUM(gold) AS total_golds,
-- Assign a regional rank to each athlete
ROW_NUMBER() OVER (PARTITION BY region ORDER BY SUM(gold) DESC) AS row_num
FROM summer_games_clean AS s
JOIN athletes AS a
ON a.id = s.athlete_id
JOIN countries AS c
ON s.country_id = c.id
-- Alias as subquery
GROUP BY region, athlete_name) AS subquery
-- Filter for only the top athlete per region
WHERE row_num = 1;
-- Pull country_gdp by region and country
SELECT
region,
country,
SUM(gdp) AS country_gdp,
-- Calculate the global gdp
SUM(SUM(gdp)) OVER () AS global_gdp,
-- Calculate percent of global gdp
SUM(gdp) / SUM(SUM(gdp)) OVER () AS perc_global_gdp,
-- Calculate percent of gdp relative to its region
SUM(gdp) / SUM(SUM(gdp)) OVER (PARTITION BY region) AS perc_region_gdp
FROM country_stats AS cs
JOIN countries AS c
ON cs.country_id = c.id
-- Filter out null gdp values
WHERE gdp IS NOT NULL
GROUP BY region, country
-- Show the highest country_gdp at the top
ORDER BY country_gdp DESC;
SELECT
-- Pull in date and weekly_avg
date,
weekly_avg,
-- Output the value of weekly_avg from 7 days prior
LAG(weekly_avg,7) OVER (ORDER BY date) AS weekly_avg_previous,
-- Calculate percent change vs previous period
weekly_avg / (LAG(weekly_avg,7) OVER (ORDER BY date)) - 1 AS perc_change
FROM
(SELECT
-- Pull in date and daily_views
date,
SUM(views) AS daily_views,
-- Calculate the rolling 7 day average
AVG(SUM(views)) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS weekly_avg
FROM web_data
-- Alias as subquery
GROUP BY date) AS subquery
-- Order by date in descending order
ORDER BY date DESC;
SELECT
-- Pull in region and calculate avg tallest height
region,
AVG(height) AS avg_tallest,
-- Calculate region's percent of world gdp
SUM(SUM(GDP)) OVER (PARTITION BY region) / SUM(SUM(GDP)) OVER () AS perc_world_gdp
FROM countries AS c
JOIN
(SELECT
-- Pull in country_id and height
country_id,
height,
-- Number the height of each country's athletes
ROW_NUMBER() OVER (PARTITION BY country_id ORDER BY height DESC) AS row_num
FROM winter_games AS w
JOIN athletes AS a ON w.athlete_id = a.id
GROUP BY country_id, height
-- Alias as subquery
ORDER BY country_id, height DESC) AS subquery
ON c.id = subquery.country_id
-- Join to country_stats
JOIN country_stats AS cs
ON cs.country_id = c.id
-- Only include the tallest height for each country
WHERE row_num = 1
GROUP BY region;