diff --git a/notebooks/r6data.ipynb b/notebooks/r6_full.ipynb similarity index 96% rename from notebooks/r6data.ipynb rename to notebooks/r6_full.ipynb index cb2963f..1663b61 100644 --- a/notebooks/r6data.ipynb +++ b/notebooks/r6_full.ipynb @@ -94,12 +94,12 @@ " for i, row in enumerate(tqdm(reader, desc=\"Importing raw data into Redis...\", total=rows, unit=\"row\")):\n", " if row[\"dateid\"] not in daily_matches: # There's a ton of data so we just select two matches per day\n", " daily_matches[row[\"dateid\"]] = set((row[\"matchid\"],))\n", - " display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", + " display(f\"Selected match {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", " if row[\"matchid\"] not in daily_matches[row[\"dateid\"]]: # Rows from a given day that aren't related to a specific match are ignored\n", " if len(daily_matches[row[\"dateid\"]]) >= 20: # Skip only if we already have two matches picked\n", " continue\n", " daily_matches[row[\"dateid\"]].add(row[\"matchid\"])\n", - " display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", + " display(f\"Selected match {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", " _ = p.hset(f\"raw:{ulid()}\", mapping=row)\n", " _ = p.execute()" ] diff --git a/notebooks/r6_load_resampled.ipynb b/notebooks/r6_load_resampled.ipynb new file mode 100644 index 0000000..da46415 --- /dev/null +++ b/notebooks/r6_load_resampled.ipynb @@ -0,0 +1,237 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "071f6969-c01a-4369-a763-871f5b9e65b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n", + "Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n", + "Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n", + "Requirement already satisfied: tqdm in 
/opt/conda/lib/python3.13/site-packages (4.67.3)\n", + "Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n", + "Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n", + "Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install redis hiredis fastid tqdm kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "526cb932-59aa-489e-bbba-954ac645b633", + "metadata": {}, + "outputs": [], + "source": [ + "from redis.cluster import RedisCluster\n", + "from redis.cache import CacheConfig\n", + "from redis.retry import Retry\n", + "from redis.backoff import ExponentialBackoff\n", + "from pathlib import Path\n", + "import kagglehub # pyright: ignore[reportMissingTypeStubs]\n", + "from csv import DictReader\n", + "from fastid import ulid\n", + "from tqdm.notebook import tqdm\n", + 
"\n", + "retry = Retry(ExponentialBackoff(), 8)\n", + "client = RedisCluster(\n", + " host=\"redis\",\n", + " port=6379,\n", + " username=\"admin\",\n", + " password=\"admin\",\n", + " retry=retry,\n", + " protocol=3,\n", + " cache_config=CacheConfig(),\n", + " decode_responses=True,\n", + " health_check_interval=3,\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be4c5461-1db2-4226-b71e-cdc7f06615bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Downsample of full dump present: True'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = Path(kagglehub.dataset_download(\"awesomizer/rainbox-six-siege-dataset\"))\n", + "downsampled = dataset/\"downsampled_S5.csv\"\n", + "display(f\"Downsample of full dump present: {downsampled.exists()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2efb5699-cced-4df0-9d76-278e23874436", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fa9c64b42394db7a5764896d7d936d0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with downsampled.open(\"rb\") as f:\n", + " rows = sum(1 for _ in tqdm(f)) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9177760b-eec4-4ab4-8242-0dd8df5db9dd", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f235da3945864d409e81037d42269756", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Importing data into Redis...: 0%| | 0/102698 [00:00=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n", + "Requirement already satisfied: pyyaml in 
/opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n", + "Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install tqdm kagglehub" + ] + }, + { + "cell_type": "markdown", + "id": "427a6e3e", + "metadata": {}, + "source": [ + "# R6 Data Resampling\n", + "\n", + "Due to the large volume of data in the original dataset (80M+ rows) we need to perform some resampling to:\n", + " - Shorten the time spent loading data into Redis to a reasonable duration\n", + " - Fit everything in memory - Redis is a memory-resident data store, and the full dataset is fairly large\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ed9fdf86", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import kagglehub # pyright: ignore[reportMissingTypeStubs]\n", + "from csv import DictReader, DictWriter\n", + "from tqdm.notebook import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "586498d7", + "metadata": {}, + "source": [ + "First we fetch the dataset from kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8e6a156d", + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/awesomizer/rainbox-six-siege-dataset/versions/1\n" + ] + } + ], + "source": [ + "dataset = Path(kagglehub.dataset_download(\"awesomizer/rainbox-six-siege-dataset\"))\n", + "\n", + "print(\"Path to dataset files:\", dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "a1e452e4", + "metadata": {}, + "source": [ + "Then we validate all the expected files are present, and count rows - the primary reason for this is displaying progress statistics during long running ops." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "30d83f96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Full dump present: True'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Objective picks present: True'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Operator loadouts present: True'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86eacf0882aa4337a32fc2ea893a4d18", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "full_dump = dataset/\"datadump_S5\"/\"datadump_S5.csv\"\n", + "objectives = dataset/\"datadump_S5_summary_objectives\"/\"datadump_S5_summary_objectives.csv\"\n", + "operator_loadouts = dataset/\"dataDump_s5_summary_operator_loadout\"/\"dataDump_s5_summary_operator_loadout.csv\"\n", + "display(f\"Full dump present: {full_dump.exists()}\")\n", + "display(f\"Objective picks present: {objectives.exists()}\")\n", + "display(f\"Operator loadouts present: {operator_loadouts.exists()}\")\n", + "\n", + "with full_dump.open(\"rb\") as f:\n", + " rows = sum(1 for _ in tqdm(f)) - 
1" + ] + }, + { + "cell_type": "markdown", + "id": "83371b4f", + "metadata": {}, + "source": [ + "Now we perform the downsampling." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "477feac7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5d34c1829a0b42649d675632071234a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Importing raw data into Redis...: 0%| | 0/85939712 [00:00= 20: # Skip only if we already have 20 matches picked\n", + " continue\n", + " daily_matches[row[\"dateid\"]].add(row[\"matchid\"])\n", + " display(f\"Selected match {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", + " writer.writerow(row)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}