{ "cells": [ { "cell_type": "markdown", "id": "7f10fef2-a7bf-4349-9c98-76944f5f681d", "metadata": {}, "source": [ "# Get MIT Living Wage by Metropolitan Statistical Area (MSA)" ] }, { "cell_type": "markdown", "id": "d172fcc1-d54b-471b-943f-73ef30b7d60c", "metadata": {}, "source": [ "This script retrieves the living wage by family type for cities in the United States. The living wage is based on the methodology established for the MIT Living Wage calculator, which is available for free at: https://livingwage.mit.edu/" ] }, { "cell_type": "markdown", "id": "9a9ce9b6-d311-4e08-9f88-765a2eb2d0ba", "metadata": {}, "source": [ "### Step 1: Import required libraries." ] }, { "cell_type": "code", "execution_count": 2, "id": "ccdf51bb-0a43-4685-8d7a-215d2f330922", "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import csv" ] }, { "cell_type": "markdown", "id": "52372c2c-f6be-4e30-82d6-6997934202e5", "metadata": {}, "source": [ "### Step 2: Define FIPS codes and state names." ] }, { "cell_type": "code", "execution_count": 4, "id": "234a3835-c377-45e5-ba74-705230ab613c", "metadata": {}, "outputs": [], "source": [ "state_fips = {\n", " \"01\": \"Alabama\", \"02\": \"Alaska\", \"04\": \"Arizona\", \"05\": \"Arkansas\", \"06\": \"California\", \n", " \"08\": \"Colorado\", \"09\": \"Connecticut\", \"10\": \"Delaware\", \"12\": \"Florida\", \"13\": \"Georgia\", \n", " \"15\": \"Hawaii\", \"16\": \"Idaho\", \"17\": \"Illinois\", \"18\": \"Indiana\", \"19\": \"Iowa\", \"20\": \"Kansas\", \n", " \"21\": \"Kentucky\", \"22\": \"Louisiana\", \"23\": \"Maine\", \"24\": \"Maryland\", \"25\": \"Massachusetts\", \n", " \"26\": \"Michigan\", \"27\": \"Minnesota\", \"28\": \"Mississippi\", \"29\": \"Missouri\", \"30\": \"Montana\", \n", " \"31\": \"Nebraska\", \"32\": \"Nevada\", \"33\": \"New Hampshire\", \"34\": \"New Jersey\", \"35\": \"New Mexico\", \n", " \"36\": \"New York\", \"37\": \"North Carolina\", \"38\": \"North Dakota\", \"39\": \"Ohio\", \"40\": \"Oklahoma\", \n", " \"41\": \"Oregon\", \"42\": \"Pennsylvania\", \"44\": \"Rhode Island\", \"45\": \"South Carolina\", \"46\": \"South Dakota\", \n", " \"47\": \"Tennessee\", \"48\": \"Texas\", \"49\": \"Utah\", \"50\": \"Vermont\", \"51\": \"Virginia\", \"53\": \"Washington\", \n", " \"54\": \"West Virginia\", \"55\": \"Wisconsin\", \"56\": \"Wyoming\"\n", "}" ] }, { "cell_type": "markdown", "id": "8a3d90d2-b4bc-4cc8-b0f9-3f2e7d640c17", "metadata": {}, "source": [ "### Step 3: Define base URL for retrieving the data." ] }, { "cell_type": "code", "execution_count": 6, "id": "b56cd310-e252-43f5-b101-f9e6eb56865e", "metadata": {}, "outputs": [], "source": [ "base_url = \"https://livingwage.mit.edu/\"" ] }, { "cell_type": "markdown", "id": "5a88193d-0981-4cf5-82f3-6d9f8895f29f", "metadata": {}, "source": [ "### Step 4: Set headers to simulate a browser request." ] }, { "cell_type": "code", "execution_count": 8, "id": "d64c1b9b-d00a-4f98-9352-3df8adb94ceb", "metadata": {}, "outputs": [], "source": [ "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'\n", "}" ] }, { "cell_type": "markdown", "id": "6c9d183b-2ef7-437d-94ea-f23bbbacd12a", "metadata": {}, "source": [ "### Step 5: Define function to parse currency values and remove dollar signs." ] }, { "cell_type": "code", "execution_count": 10, "id": "ccea6fd4-71fd-46b0-b267-c019a8770657", "metadata": {}, "outputs": [], "source": [ "def parse_currency(value):\n", " try:\n", " return float(value.replace('$', '').replace(',', ''))\n", " except ValueError:\n", " return None" ] }, { "cell_type": "markdown", "id": "63a4af0c-363e-4185-b4f8-4f8cc2c76f1d", "metadata": {}, "source": [ "### Step 6: Define function to retrive living wage data." ] }, { "cell_type": "code", "execution_count": 12, "id": "4cb1d9fe-a008-48df-badb-a1d12ab2c2d7", "metadata": {}, "outputs": [], "source": [ "def get_all_annual_wages(url):\n", " try:\n", " response = requests.get(url, headers=headers)\n", " response.raise_for_status()\n", " soup = BeautifulSoup(response.text, 'html.parser')\n", " tables = soup.find_all('table')\n", "\n", " if not tables:\n", " print(f\"No tables found at {url}\")\n", " return {}\n", "\n", " # We know the data is in the third row (index 2)\n", " rows = tables[0].find_all('tr')\n", " if len(rows) < 3:\n", " print(f\"Expected row not found in table at {url}\")\n", " return {}\n", "\n", " wage_row = rows[2] # Third row\n", " cols = wage_row.find_all(['td', 'th'])\n", "\n", " # Expected 12 columns of wage data starting from index 1\n", " if len(cols) < 13:\n", " print(f\"Not enough columns in wage row at {url}\")\n", " return {}\n", "\n", " # Define keys in the exact order they appear in the table\n", " keys = [\n", " \"1adult_0children\", \"1adult_1child\", \"1adult_2children\", \"1adult_3children\",\n", " \"2adults1working_0children\", \"2adults1working_1child\", \"2adults1working_2children\", \"2adults1working_3children\",\n", " \"2adults2working_0children\", \"2adults2working_1child\", \"2adults2working_2children\", \"2adults2working_3children\"\n", " ]\n", "\n", " wages = {}\n", " for i in range(12):\n", " value = parse_currency(cols[i + 1].get_text(strip=True)) # +1 to skip label column\n", " wages[keys[i]] = round(value * 2080, 2) if value is not None else \"N/A\"\n", "\n", " return wages\n", "\n", " except Exception as e:\n", " print(f\"Error processing URL {url}: {e}\")\n", " return {}" ] }, { "cell_type": "markdown", "id": "acc03684-0617-4c4c-9bca-9cde6b2d4ccc", "metadata": {}, "source": [ "### Step 7: Define expected wage keys." ] }, { "cell_type": "code", "execution_count": 14, "id": "e354506e-8248-4e04-81f8-4f1c7beb185e", "metadata": {}, "outputs": [], "source": [ "expected_wage_keys = []\n", "family_types = ['1adult', '2adults1working', '2adults2working']\n", "children = ['0children', '1child', '2children', '3children']\n", "for f in family_types:\n", " for c in children:\n", " expected_wage_keys.append(f\"{f}_{c}\")" ] }, { "cell_type": "markdown", "id": "8cb27f19-15a1-4946-97dc-30abf837ea9b", "metadata": {}, "source": [ "### Step 8: Initialize data storage." ] }, { "cell_type": "code", "execution_count": 16, "id": "24952862-34a0-482e-89d6-f4b29dc3486c", "metadata": {}, "outputs": [], "source": [ "metro_data = []" ] }, { "cell_type": "markdown", "id": "9e76a71b-5063-4d63-9281-6afa2119cc07", "metadata": {}, "source": [ "### Step 9: Loop through states to retrieve living wage data." ] }, { "cell_type": "code", "execution_count": null, "id": "99d92f9e-ecf8-4754-b6c6-ec738d91e6a0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing state: Alabama (01)...\n", "Finished processing state: Alabama\n", "Processing state: Alaska (02)...\n", "Finished processing state: Alaska\n", "Processing state: Arizona (04)...\n", "Finished processing state: Arizona\n", "Processing state: Arkansas (05)...\n", "Finished processing state: Arkansas\n", "Processing state: California (06)...\n", "Finished processing state: California\n", "Processing state: Colorado (08)...\n", "Finished processing state: Colorado\n", "Processing state: Connecticut (09)...\n", "Finished processing state: Connecticut\n", "Processing state: Delaware (10)...\n", "Finished processing state: Delaware\n", "Processing state: Florida (12)...\n", "Finished processing state: Florida\n", "Processing state: Georgia (13)...\n", "Finished processing state: Georgia\n", "Processing state: Hawaii (15)...\n", "Finished processing state: Hawaii\n", "Processing state: Idaho (16)...\n", "Finished processing state: Idaho\n", "Processing state: Illinois (17)...\n", "Finished processing state: Illinois\n", "Processing state: Indiana (18)...\n", "Finished processing state: Indiana\n", "Processing state: Iowa (19)...\n", "Finished processing state: Iowa\n", "Processing state: Kansas (20)...\n", "Finished processing state: Kansas\n", "Processing state: Kentucky (21)...\n", "Finished processing state: Kentucky\n", "Processing state: Louisiana (22)...\n", "Finished processing state: Louisiana\n", "Processing state: Maine (23)...\n", "Finished processing state: Maine\n", "Processing state: Maryland (24)...\n", "Finished processing state: Maryland\n", "Processing state: Massachusetts (25)...\n", "Finished processing state: Massachusetts\n", "Processing state: Michigan (26)...\n", "Finished processing state: Michigan\n", "Processing state: Minnesota (27)...\n", "Finished processing state: Minnesota\n", "Processing state: Mississippi (28)...\n", "Finished processing state: Mississippi\n", "Processing state: Missouri (29)...\n", "Finished processing state: Missouri\n", "Processing state: Montana (30)...\n", "Finished processing state: Montana\n", "Processing state: Nebraska (31)...\n", "Finished processing state: Nebraska\n", "Processing state: Nevada (32)...\n", "Finished processing state: Nevada\n", "Processing state: New Hampshire (33)...\n", "Finished processing state: New Hampshire\n", "Processing state: New Jersey (34)...\n", "Finished processing state: New Jersey\n", "Processing state: New Mexico (35)...\n", "Finished processing state: New Mexico\n", "Processing state: New York (36)...\n", "Finished processing state: New York\n", "Processing state: North Carolina (37)...\n", "Finished processing state: North Carolina\n", "Processing state: North Dakota (38)...\n", "Finished processing state: North Dakota\n", "Processing state: Ohio (39)...\n", "Finished processing state: Ohio\n", "Processing state: Oklahoma (40)...\n", "Finished processing state: Oklahoma\n", "Processing state: Oregon (41)...\n", "Finished processing state: Oregon\n", "Processing state: Pennsylvania (42)...\n", "Finished processing state: Pennsylvania\n", "Processing state: Rhode Island (44)...\n", "Finished processing state: Rhode Island\n", "Processing state: South Carolina (45)...\n", "Finished processing state: South Carolina\n", "Processing state: South Dakota (46)...\n", "Finished processing state: South Dakota\n", "Processing state: Tennessee (47)...\n", "Finished processing state: Tennessee\n", "Processing state: Texas (48)...\n", "Finished processing state: Texas\n", "Processing state: Utah (49)...\n", "Finished processing state: Utah\n", "Processing state: Vermont (50)...\n", "Finished processing state: Vermont\n", "Processing state: Virginia (51)...\n", "Finished processing state: Virginia\n", "Processing state: Washington (53)...\n", "Finished processing state: Washington\n", "Processing state: West Virginia (54)...\n", "Finished processing state: West Virginia\n", "Processing state: Wisconsin (55)...\n" ] } ], "source": [ "for state_code, state_name in state_fips.items():\n", " print(f\"Processing state: {state_name} ({state_code})...\")\n", "\n", " state_url = f\"{base_url}states/{state_code}/locations\"\n", " state_response = requests.get(state_url, headers=headers)\n", "\n", " if state_response.status_code == 200:\n", " state_soup = BeautifulSoup(state_response.content, 'html.parser')\n", " msa_section = state_soup.find('div', class_=\"container\")\n", "\n", " if msa_section:\n", " msa_links = msa_section.find_all('a', href=True)\n", "\n", " for msa_link in msa_links:\n", " if 'metros' in msa_link['href']:\n", " msa_url = msa_link['href']\n", " full_msa_url = f\"{base_url}{msa_url}\"\n", " msa_name = msa_link.text.strip()\n", " msa_name_label = msa_name[:-4].strip() if len(msa_name) > 4 else msa_name\n", "\n", " wage_data = get_all_annual_wages(full_msa_url)\n", "\n", " row = [\n", " state_code, state_name, msa_name, msa_name_label, full_msa_url\n", " ]\n", " for key in expected_wage_keys:\n", " row.append(wage_data.get(key, \"N/A\"))\n", "\n", " metro_data.append(row)\n", " else:\n", " print(f\"No MSA section found for {state_name}\")\n", " else:\n", " print(f\"Failed to retrieve state page: {state_url}\")\n", "\n", " print(f\"Finished processing state: {state_name}\")\n" ] }, { "cell_type": "markdown", "id": "b733f7da-bf44-4a6d-b462-50400c7fbdea", "metadata": {}, "source": [ "### Step 10: Write data to a CSV file." ] }, { "cell_type": "code", "execution_count": 23, "id": "e68408a2-dd9d-4341-a002-32df9a9d5820", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CSV file 'mit_living_wage_by_msa.csv' has been created.\n" ] } ], "source": [ "header = [\n", " \"State Code\", \"State Name\", \"MSA Name\", \"MSA Name Label\", \"MSA URL\"\n", "] + expected_wage_keys\n", "\n", "with open('mit_living_wage_by_msa.csv', 'w', newline='', encoding='utf-8') as file:\n", " writer = csv.writer(file)\n", " writer.writerow(header)\n", "\n", " for row in metro_data:\n", " writer.writerow(row)\n", "\n", "print(\"CSV file 'mit_living_wage_by_msa.csv' has been created.\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:base] *", "language": "python", "name": "conda-base-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }