Skip to content

Commit

Permalink
feat: 🌱 Seed Command
Browse files Browse the repository at this point in the history
commit 218ce70
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 18:11:19 2024 -0500

    feat: 🌱 created a seed command

    generates 3 new leaderboard documents

commit 200565e
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 18:05:39 2024 -0500

    feat: ✨ modified commands using subparsers

commit e3a0ff5
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:38:26 2024 -0500

    style: 🎨 Formatted Code

commit 970b95a
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:19:50 2024 -0500

    refactor: ♻️ updated magic constants in `shared`

    Constants: `MAX_NUM_OF_TEAMS`, `DECIMALS`, `ROOT_FOLDER_PATH`

commit 957213d
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:03:20 2024 -0500

    perf: ⚡️ reduced pandas imports

commit 86453b6
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 14:51:07 2024 -0500

    docs: 📝 updated instructions and commands in README

    added `--dev` to install develop packages

commit 7dab886
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 14:40:50 2024 -0500

    chore: ➕ installed `pyarrow` as pandas dependency

    `pyarrow` will be a future dependency for pandas: pandas-dev/pandas#54466
  • Loading branch information
ShawnGeorge03 committed Mar 7, 2024
1 parent df2643d commit 1871492
Show file tree
Hide file tree
Showing 6 changed files with 354 additions and 188 deletions.
1 change: 1 addition & 0 deletions Pipfile
Expand Up @@ -12,6 +12,7 @@ pymongo = "*"

[dev-packages]
tabulate = "*"
pyarrow = "*"

[requires]
python_version = "3.12"
Expand Down
337 changes: 211 additions & 126 deletions Pipfile.lock

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions README.md
Expand Up @@ -12,7 +12,7 @@ Creates a server to update DS3 Datathon Leaderboard using Kaggle Leaderboard
Read [below](#custom-kaggle-api)

```bash
pipenv install
pipenv install --dev
```

2. Copy the `.env.example` to `.env`:
Expand All @@ -33,12 +33,14 @@ Creates a server to update DS3 Datathon Leaderboard using Kaggle Leaderboard

All commands are run from the root of the project, from a terminal:

| Command | Action |
| -------------------- | ---------------------------------------------- |
| `pipenv install` | Installs packages to virtual environment |
| `pipenv run dev` | Starts local dev server at `0.0.0.0:8000` |
| `pipenv run start` | Creates a production server at `0.0.0.0:8000` |
| Command | Action |
| ---------------------- | ----------------------------------------------|
| `pipenv install` | Installs default packages |
| `pipenv install --dev` | Installs develop and default packages |
| `pipenv run dev` | Starts local dev server at `0.0.0.0:8000` |
| `pipenv run start` | Creates a production server at `0.0.0.0:8000` |

Check the [`pipenv`](https://pipenv.pypa.io/en/latest/cli.html) docs for more commands

Checkout the [Wiki](https://github.com/theDS3/Datathon-Leaderboard/wiki) for details about build images and other topics
Check out the [Wiki](https://github.com/theDS3/Datathon-Leaderboard/wiki) for
details about build images and other topics
132 changes: 103 additions & 29 deletions src/leaderboard.py
Expand Up @@ -4,8 +4,13 @@
from dotenv import dotenv_values
from tabulate import tabulate

from numpy.random import randint, uniform
from pandas import DataFrame

from shared import (
ROOT_PATH,
ROOT_FOLDER_PATH,
MAX_NUM_OF_TEAMS,
DECIMALS,
get_abs_file_paths,
process_csv,
process_competitions,
Expand All @@ -21,53 +26,122 @@ class Secrets(TypedDict):
config: Secrets = dotenv_values(".env")

if len(config) == 0:
raise RuntimeError('Missing .env file')
raise RuntimeError("Missing .env file")

parser = ArgumentParser(
prog="leaderboard.py",
description="Create Private & Final Leaderboard Entry",
description="Add Leaderboard Entry",
epilog="Update Datathon Leaderboard",
)

parser.add_argument(
subparsers = parser.add_subparsers(dest="command")

# Seed command: generate mock standings for every leaderboard type
seed_parser = subparsers.add_parser("seed", help="Generate Mock Standings")
seed_parser.add_argument(
    "num_of_teams",
    metavar="teams-size",
    type=int,
    nargs="?",  # optional positional; falls back to the project-wide cap
    default=MAX_NUM_OF_TEAMS,
    # Fixed: this is the number of mock teams to generate, not a RNG seed value
    help=f"Number of teams to generate (default: {MAX_NUM_OF_TEAMS})",
)

# Show command
show_parser = subparsers.add_parser("show", help="Display the latest Standing")
show_parser.add_argument(
"type",
metavar="type",
type=str,
choices={"public", "private", "final"},
help="Leaderboard Entry Type",
choices={"private", "final"},
)

parser.add_argument(
"-p",
"--publish",
help="Publish the latest standing",
action="store_true",
default=False,
# Publish command: push the latest standings to the database.
# Unlike `show` (private/final only), `publish` also accepts "public".
publish_parser = subparsers.add_parser("publish", help="Publish the latest Standing")
publish_parser.add_argument(
    "type",
    metavar="type",
    type=str,
    choices={"public", "private", "final"},
    help="Leaderboard Entry Type",
)

args = parser.parse_args()

PRIVATE_PATH = f"{ROOT_PATH}/private"

client = MongoClient(config.get("MONGO_URI"))
db = client.get_database(config.get("MONGO_DB"))
leaderboard_col = db.get_collection("leaderboard")

csv_files = get_abs_file_paths(PRIVATE_PATH, ".csv")
df = process_csv(csv_files)
standings = process_competitions(df, args.type, leaderboard_col)

if len(standings) == 0:
raise ValueError(f"Review the CSV files in {PRIVATE_PATH}")


print(f"Latest Standing for {args.type.capitalize()} Leaderboard")
headers = [x.capitalize() for x in standings[0].keys()]
rows = [x.values() for x in standings]
print(tabulate(rows, headers, tablefmt="mixed_outline", numalign='left'))

if args.publish:
update_leaderboard(standings, args.type, leaderboard_col)
print(f"Updated {args.type.capitalize()} Leaderboard on {db} Database")
if args.command == "seed":
    # Mock team names and a shared per-team attempt count, reused for
    # every leaderboard type so the three entries describe the same teams.
    name = [f"Team-{i}" for i in range(1, args.num_of_teams + 1)]
    attempts = randint(low=1, high=40, size=args.num_of_teams)

    # PEP 8 (E731): named defs instead of lambda assignments.
    def score(high=100):
        """Per-team score: sum of 3 uniform draws in [0, high)."""
        return uniform(low=0, high=high, size=(args.num_of_teams, 3)).sum(axis=1)

    def delta():
        """Per-team rank movement: '-' for 0, '+n' for gains, n for losses."""
        return [
            "-" if x == 0 else f"+{x}" if x > 0 else x
            for x in randint(-12, 12, size=args.num_of_teams)
        ]

    for leaderboard_type in ["public", "private", "final"]:
        df = DataFrame(
            {
                "name": name,
                "score": score(100 if leaderboard_type != "final" else 94),
                "attempts": attempts,
                "delta": delta(),
            }
        )

        if leaderboard_type == "final":
            # Final standings apply a random bonus multiplier to the raw
            # score and rank on the resulting finalScore.
            df = df.assign(
                bonus=uniform(0.1, 2.0, size=args.num_of_teams)
                ** randint(1, 4, size=args.num_of_teams)
            )
            df = df.assign(finalScore=df["bonus"] * df["score"])

            df.sort_values(
                by=["finalScore", "score", "attempts"],
                inplace=True,
                ascending=[False, False, True],
            )
            df["bonus"] = df["bonus"].astype(float).round(DECIMALS)
            df["score"] = df["score"].astype(float).round(DECIMALS)
            df["finalScore"] = df["finalScore"].astype(float).round(DECIMALS)
        else:
            # Public/private rank on score, breaking ties by fewer attempts.
            df.sort_values(
                by=["score", "attempts"], inplace=True, ascending=[False, True]
            )
            df["score"] = df["score"].astype(float).round(DECIMALS)

        update_leaderboard(df.to_dict("records"), leaderboard_type, leaderboard_col)
        print(
            f"Added Mock Entry for {leaderboard_type.capitalize()} "
            f"Leaderboard on {db.name} Database"
        )

elif args.command in ["show", "publish"]:
    PRIVATE_PATH = f"{ROOT_FOLDER_PATH}/private"

    # Build the latest standings from the competition CSV exports.
    csv_files = get_abs_file_paths(PRIVATE_PATH, ".csv")
    df = process_csv(csv_files)
    standings = process_competitions(df, args.type, leaderboard_col)

    if len(standings) == 0:
        # Private/final standings come from local CSVs; public comes from Kaggle.
        if args.type != "public":
            raise ValueError(f"Review the CSV files in {PRIVATE_PATH}")
        raise ValueError("Review the Competition on Kaggle")  # F541: no placeholders

    print(f"Latest Standing for {args.type.capitalize()} Leaderboard")
    headers = [x.capitalize() for x in standings[0].keys()]
    rows = [x.values() for x in standings]
    print(tabulate(rows, headers, tablefmt="mixed_outline", numalign="left"))

    if args.command == "publish":
        update_leaderboard(standings, args.type, leaderboard_col)
        print(f"Updated {args.type.capitalize()} Leaderboard on {db.name} Database")

client.close()
8 changes: 4 additions & 4 deletions src/main.py
Expand Up @@ -10,8 +10,8 @@
from pydantic import BaseModel

from src.shared import (
ROOT_PATH,
MAX_TEAMS,
ROOT_FOLDER_PATH,
MAX_NUM_OF_TEAMS,
get_abs_file_paths,
process_csv,
process_competitions,
Expand All @@ -20,12 +20,12 @@

class Request(BaseModel):
competitions: list[str]
numOfTeams: int = MAX_TEAMS
numOfTeams: int = MAX_NUM_OF_TEAMS


app = FastAPI()

PUBLIC_PATH = f"{ROOT_PATH}/public"
PUBLIC_PATH = f"{ROOT_FOLDER_PATH}/public"

def download_competitions(competitions: list[str]) -> list[str]:
"""Downloads and Extracts leaderboard information from multiple competitions
Expand Down
48 changes: 26 additions & 22 deletions src/shared.py
@@ -1,11 +1,11 @@
from typing import TypedDict, NotRequired, Literal, Dict
from typing import TypedDict, NotRequired, Literal
from pymongo.collection import Collection

from datetime import datetime
from pytz import timezone
from pandas import DataFrame, read_csv, merge

import os
import pandas as pd

type LeaderboardType = Literal["public", "private", "final"]

Expand All @@ -20,9 +20,13 @@ class LeaderboardEntry(TypedDict):


# Maximum number of teams to be displayed
MAX_TEAMS = 40
MAX_NUM_OF_TEAMS = 40

ROOT_PATH = "data"
# Maximum decimal places for rounding
DECIMALS = 5

# Root Folder
ROOT_FOLDER_PATH = "data"


def get_abs_file_paths(
Expand Down Expand Up @@ -62,7 +66,7 @@ def get_abs_file_paths(
return files if not is_sorted else sorted(files)


def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
def process_csv(csv_file_paths: list[str]) -> DataFrame:
"""Extract leaderboard information from *.csv
Args:
Expand All @@ -72,13 +76,13 @@ def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
ValueError: `InvalidFile`
Returns:
pd.DataFrame: Kaggle API Data with Scores and Counts for each Team from all competitions
DataFrame: Kaggle API Data with Scores and Counts for each Team from all competitions
"""

df = pd.DataFrame()
df = DataFrame()
for csv_file_path in csv_file_paths:
# Read the CSV files and filter specific columns
df_comp = pd.read_csv(csv_file_path).filter(
df_comp = read_csv(csv_file_path).filter(
["TeamName", "Score", "SubmissionCount"]
)

Expand All @@ -95,24 +99,24 @@ def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
]

# Join the multiple csv from
df = df_comp if df.empty else pd.merge(df, df_comp, on=["name"], how="outer")
df = df_comp if df.empty else merge(df, df_comp, on=["name"], how="outer")

return df


def process_competitions(
df: pd.DataFrame,
df: DataFrame,
leaderboard_type: LeaderboardType,
coll: Collection,
size=MAX_TEAMS,
size=MAX_NUM_OF_TEAMS,
) -> list[LeaderboardEntry]:
"""Process the leaderboard data from multiple competitions
Args:
df (pd.DataFrame): Kaggle API Data with Scores and Counts for each Team from all competitions.
df (DataFrame): Kaggle API Data with Scores and Counts for each Team from all competitions.
leaderboard_type (LeaderboardType): Specific mode of calculations
coll (Collection): Collection that contains the leaderboard snapshots.
size (int, optional): Limits number of teams. Defaults to `MAX_TEAMS`.
size (int, optional): Limits number of teams. Defaults to `MAX_NUM_OF_TEAMS`.
Raises:
ValueError: `InvalidCollection`
Expand Down Expand Up @@ -166,14 +170,14 @@ def process_competitions(
df["bonus"] = 1.0
df.set_index("name", inplace=True)

MAPPING_CSV = f"{ROOT_PATH}/final/mapping.csv"
BONUS_PATH = f"{ROOT_PATH}/final/bonus"
MAPPING_CSV = f"{ROOT_FOLDER_PATH}/final/mapping.csv"
BONUS_PATH = f"{ROOT_FOLDER_PATH}/final/bonus"

if not os.path.isfile(MAPPING_CSV):
raise ValueError(f"FileDoesNotExist: {MAPPING_CSV}")

# Get mapping
df_mapping = pd.read_csv(MAPPING_CSV)
df_mapping = read_csv(MAPPING_CSV)

# Get a list of all CSV files in the folder
csv_files = get_abs_file_paths(BONUS_PATH, ".csv")
Expand All @@ -185,10 +189,10 @@ def process_competitions(
for csv_file in csv_files:

# Load the CSV file into a dataframe
df_attendance = pd.read_csv(csv_file)
df_attendance = read_csv(csv_file)

# Subsetting attendance
merged_df = pd.merge(df_mapping, df_attendance, on="Email", how="inner")
merged_df = merge(df_mapping, df_attendance, on="Email", how="inner")

# Populating with 1's
merged_df["Attendance"] = 1
Expand Down Expand Up @@ -221,9 +225,9 @@ def process_competitions(
df.reset_index(inplace=True, drop=False)

# round
df["bonus"] = df["bonus"].astype(float).round(5)
df["score"] = df["score"].astype(float).round(5)
df["finalScore"] = df["finalScore"].astype(float).round(5)
df["bonus"] = df["bonus"].astype(float).round(DECIMALS)
df["score"] = df["score"].astype(float).round(DECIMALS)
df["finalScore"] = df["finalScore"].astype(float).round(DECIMALS)

compare_leaderboard_type = "private"
curr_standings = df[:size][
Expand All @@ -236,7 +240,7 @@ def process_competitions(
df.reset_index(inplace=True, drop=True)

# round
df["score"] = df["score"].astype(float).round(5)
df["score"] = df["score"].astype(float).round(DECIMALS)

curr_standings = df[:size][["name", "score", "attempts"]].to_dict("records")

Expand Down

0 comments on commit 1871492

Please sign in to comment.