Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@ backend/tests.http

# ML Models & Artifacts
*.joblib

# Environment Variables
.env
34 changes: 28 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ logs: ## Follow logs for all services
# ==============================================================================
# DEVELOPMENT & TESTING
# ==============================================================================
# One-shot bootstrap of a local environment: runs the `up`, `seed-db` and
# `build-model` targets as prerequisites, then prints a confirmation.
# NOTE(review): seed-db and build-model use `docker compose exec`, so they need
# `up` to have finished first; that ordering only holds for a serial make run.
setup: up seed-db build-model ## Run this once to setup a new LOCAL environment from scratch
	@echo "$(GREEN)✅ Initial local setup complete! Database is seeded and model is built.$(RESET)"

# Runs seed_database.py inside the already-running `backend` compose service.
seed-db: ## Run the database seed script on the LOCAL docker DB
	@echo "$(YELLOW)--> Seeding LOCAL database...$(RESET)"
	@docker compose exec backend python seed_database.py

# Runs build_similarity_model.py inside the already-running `backend` compose service.
build-model: ## Run the ML model training script
	@echo "$(YELLOW)--> Building similarity model...$(RESET)"
	@docker compose exec backend python build_similarity_model.py

Expand All @@ -57,4 +57,26 @@ test-backend: ## Run backend python tests

# Runs the frontend test suite once in a throwaway `frontend` container;
# `--watchAll=false` is forwarded to the test runner after the `--` separator.
test-frontend: ## Run frontend javascript tests
	@echo "$(GREEN)--> Running frontend tests...$(RESET)"
	@docker compose run --rm frontend npm test -- --watchAll=false

# Runs the backend tests in a fresh python:3.11-slim container instead of the
# compose stack: ./backend is bind-mounted at /app, dependencies and pytest are
# installed from scratch, then pytest runs.
# $(CURDIR) is make's built-in absolute working directory, so no extra shell is
# spawned just to compute the mount path.
.PHONY: test-ci
test-ci: ## Simulate the CI environment perfectly
	@echo "$(YELLOW)--> Running tests in a clean CI-like environment...$(RESET)"
	@docker run --rm \
		-v "$(CURDIR)/backend":/app \
		-w /app \
		python:3.11-slim \
		sh -c "pip install -r requirements.txt && pip install pytest && pytest"

# ==============================================================================
# PRODUCTION BUILDS
# ==============================================================================
# Backend URL handed to Cloud Build as the _API_BASE_URL substitution.
# Override per invocation: `make build-frontend-prod PROD_API_BASE_URL=https://...`
PROD_API_BASE_URL ?= https://wnba-backend-service-776933261932.us-west1.run.app

# Submits ./frontend to Cloud Build using frontend/cloudbuild.yaml.
.PHONY: build-frontend-prod
build-frontend-prod: ## Build the production frontend image for deployment
	@echo "$(YELLOW)--> Building production frontend image...$(RESET)"
	@gcloud builds submit --config frontend/cloudbuild.yaml \
		--substitutions=_API_BASE_URL=$(PROD_API_BASE_URL) \
		./frontend

# Image tag for the production backend build.
# Override per invocation: `make build-backend-prod PROD_BACKEND_IMAGE=gcr.io/...`
PROD_BACKEND_IMAGE ?= gcr.io/wnba-analytics-prod/wnba-backend

# Submits ./backend to Cloud Build and tags the resulting image.
.PHONY: build-backend-prod
build-backend-prod: ## Build the production backend image for deployment
	@echo "$(YELLOW)--> Building production backend image...$(RESET)"
	@gcloud builds submit ./backend --tag $(PROD_BACKEND_IMAGE)
20 changes: 13 additions & 7 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
# backend/Dockerfile

# --- Stage 1: The "base" stage ---
# Shared by the development and production stages: Python 3.11 plus the
# packages listed in requirements.txt.
FROM python:3.11-slim AS base
WORKDIR /app

# Copy only the requirements file before installing, so this layer does not
# depend on the rest of the source tree.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# --- Stage 2: The "development" stage ---
FROM base AS development
WORKDIR /app
COPY . .

# Expose the port the app runs on
EXPOSE 8000

# --reload restarts uvicorn when source files change.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

# --- Stage 3: The "production" stage ---
FROM base AS production
WORKDIR /app
COPY . .
# Generate the similarity-model artifacts at image build time.
RUN python build_similarity_model.py
EXPOSE 8080
# Shell form is required so ${PORT:-8080} is expanded at container start.
# `exec` replaces the wrapping shell with uvicorn, making it PID 1 so that it
# receives SIGTERM directly and can shut down cleanly.
CMD exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-8080}
83 changes: 39 additions & 44 deletions backend/build_similarity_model.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,56 @@
# backend/build_similarity_model.py

import logging
import pandas as pd
import joblib
from sqlalchemy import text
import json

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from database import SessionLocal, engine

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The JSON files are now the source of truth for the build
DATA_FILES = [
'data/wnba_combined_2024.json',
]

def build_model():
logger.info("Connecting to database to fetch player stats...")
db = SessionLocal()

# Load all player stats into a Pandas DataFrame
# SQL query to join players and stats
query = """
SELECT
p.id as player_id,
p.first_name,
p.last_name,
ps.season,
ps.points_per_game,
ps.rebounds_per_game,
ps.assists_per_game,
ps.steals_per_game,
ps.blocks_per_game,
ps.field_goal_percentage,
ps.three_point_percentage,
ps.player_efficiency_rating
FROM players p
JOIN player_stats ps ON p.id = ps.player_id
"""
df = pd.read_sql(text(query), con=engine.connect())
db.close()
logger.info(f"Successfully loaded {len(df)} player seasons from the database.")

if df.empty:
logger.error("No data found in the database. Please run the seed script first.")
return

# Select features used for comparison
logger.info("Loading player data from JSON files...")

all_seasons_df = []
for file_name in DATA_FILES:
year = file_name.split('_')[2].split('.')[0]
with open(file_name, 'r') as f:
season_data = json.load(f)

# Add a 'season' column to each record
for record in season_data:
record['season'] = year

all_seasons_df.append(pd.DataFrame(season_data))

df = pd.concat(all_seasons_df, ignore_index=True)

# Clean player names and handle multi-team players ('TOT')
df['Player'] = df['Player'].str.replace('*', '', regex=False)
df = df[df['Team'] != 'TOT'] # Exclude total rows
df = df.dropna(subset=['Player']) # Drop rows with no player name

logger.info(f"Successfully loaded {len(df)} total player seasons.")

# Define the unique ID and the features for the model
df['player_season_id'] = df['Player'] + ' (' + df['season'] + ')'
features = [
'points_per_game', 'rebounds_per_game', 'assists_per_game',
'steals_per_game', 'blocks_per_game', 'field_goal_percentage',
'three_point_percentage', 'player_efficiency_rating'
'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'PER', 'WS'
]
# Ensure all feature columns exist and fill NaNs with 0
for feature in features:
if feature not in df.columns:
df[feature] = 0
df[features] = df[features].fillna(0)

# Unique identifier for each player-season
df['player_season_id'] = df['first_name'] + ' ' + df['last_name'] + ' (' + df['season'] + ')'
df_features = df.set_index('player_season_id')[features]

# Normalize the data
Expand All @@ -59,8 +59,6 @@ def build_model():
logger.info("Features have been scaled.")

# Calculate Cosine Similarity
# This creates a big matrix where every player-season is compared to every other one.
# The result is a score from 0 (completely different) to 1 (identical).
similarity_matrix = cosine_similarity(scaled_features)
logger.info("Similarity matrix has been calculated.")

Expand All @@ -69,9 +67,6 @@ def build_model():
joblib.dump(similarity_matrix, 'similarity_matrix.joblib')

logger.info("Model artifacts have been saved successfully!")
logger.info("-> similarity_data.joblib (Player data and vectors)")
logger.info("-> similarity_matrix.joblib (Comparison scores)")


if __name__ == "__main__":
build_model()
40 changes: 29 additions & 11 deletions backend/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,38 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base

# --- Check for a test-specific database URL first ---
# The getenv() function reads an environment variable.
# We provide a default value for local development.
DATABASE_URL = os.getenv(
"DATABASE_URL",
"postgresql://admin:password123@db/wnba_db"
)
def get_database_url():
    """
    Resolve the database URL from environment variables, in priority order:

    1. An explicit DATABASE_URL (used for testing), returned unchanged.
    2. Google Cloud Run's Unix socket connection, selected when K_SERVICE is set.
    3. The local Docker PostgreSQL database (for development).

    Returns:
        str: the connection URL to hand to ``create_engine``.

    Raises:
        KeyError: if K_SERVICE is set but DB_USER, DB_PASS, DB_NAME or
            INSTANCE_CONNECTION_NAME is missing from the environment.
    """
    # Local import keeps this block self-contained; only the Cloud Run branch needs it.
    from urllib.parse import quote

    # 1. Highest priority: an explicitly supplied URL.
    if db_url := os.getenv("DATABASE_URL"):
        return db_url

    # 2. Second priority: running in Google Cloud Run.
    if os.getenv("K_SERVICE"):
        # Percent-encode the credentials: characters such as '@', '/', ':' or '%'
        # in a user name or password would otherwise be read as URL delimiters.
        db_user = quote(os.environ["DB_USER"], safe="")
        db_pass = quote(os.environ["DB_PASS"], safe="")
        db_name = os.environ["DB_NAME"]
        db_socket_dir = "/cloudsql"
        instance_connection_name = os.environ["INSTANCE_CONNECTION_NAME"]

        # Unix socket connection string: the host part is empty and the socket
        # directory is passed through the `host` query parameter.
        return (
            f"postgresql+psycopg2://{db_user}:{db_pass}@/{db_name}"
            f"?host={db_socket_dir}/{instance_connection_name}"
        )

    # 3. Default: fall back to the local Docker database URL.
    return "postgresql://admin:password123@db:5432/wnba_db"

DATABASE_URL = get_database_url()

# SQLite needs check_same_thread=False; other drivers get no extra arguments.
# Test the URL scheme rather than searching the whole string, so a non-SQLite
# URL whose credentials or database name contain "sqlite" is not misdetected.
_connect_args = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {}

# Build the engine once, with the arguments chosen above.
engine = create_engine(DATABASE_URL, connect_args=_connect_args)

# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class for the ORM models.
Base = declarative_base()
3 changes: 2 additions & 1 deletion backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ async def lifespan(app: FastAPI):

app = FastAPI(lifespan=lifespan)

origins = ["http://localhost:3000"]
origins = ["http://localhost:3000",
"https://wnba-frontend-service-776933261932.us-west1.run.app"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
Expand Down
39 changes: 25 additions & 14 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,39 @@
# backend/tests/conftest.py
import pytest
import os

# This tells our app to use the SQLite DB for all tests.
os.environ['DATABASE_URL'] = "sqlite:///./test.db"

from fastapi.testclient import TestClient
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Set the environment variable BEFORE other imports.
os.environ['DATABASE_URL'] = "sqlite:///./test.db"

from main import app, get_db
from database import Base, engine # We can now import the engine safely
from database import Base, engine

TestingSessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Override the get_db dependency to use the test database
app.dependency_overrides[get_db] = lambda: TestingSessionLocal()

@pytest.fixture(scope="function")
def db_session():
"""
This fixture creates a clean database with all tables for a single test function,
and then drops all tables after the test is done.
"""
Base.metadata.create_all(bind=engine)
yield TestingSessionLocal()
Base.metadata.drop_all(bind=engine)

@pytest.fixture(scope="function")
def test_client(db_session):
yield TestClient(app)
# This is the key: we override the app's dependency to use our
# clean, temporary database for the duration of the test.
def override_get_db():
    """Yield a TestingSessionLocal session and close it once the caller is done."""
    # Create the session before entering `try`: if creation itself fails there is
    # nothing to close, and the original error must not be masked by an
    # UnboundLocalError raised from the `finally` clause.
    db = TestingSessionLocal()
    try:
        yield db
    finally:
        db.close()

app.dependency_overrides[get_db] = override_get_db

# Yield nothing, just perform setup and teardown
yield

# Teardown: clean up the override and drop tables
del app.dependency_overrides[get_db]
Base.metadata.drop_all(bind=engine)
13 changes: 7 additions & 6 deletions backend/tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# backend/tests/test_main.py
from fastapi.testclient import TestClient
from main import app

# This test doesn't need a database, so it doesn't need a fixture.
def test_read_root():
    """
    Tests if the root API endpoint ('/api') returns a successful response
    and the expected JSON message.
    """
    # It creates its own client.
    client = TestClient(app)
    response = client.get("/api")
    assert response.status_code == 200
    assert response.json() == {"message": "WNBA Analytics API is running!"}
Loading