feat: initial dataloading
finalized the data grabbing, sampling, and loading code, along with defining the schema for the ft (search) index
This commit is contained in:
@@ -1,74 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05c0de49-0eb8-4cd2-89b2-c312111dda85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install redis hiredis fastid faker tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "90326242-f141-4035-9053-d3aab6cc9224",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from redis.cluster import RedisCluster\n",
|
||||
"\n",
|
||||
"client = RedisCluster(host=\"redis\", port=6379)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "50b9f473-8924-4d03-acf8-71ecf25e54a8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from fastid import ulid\n",
|
||||
"from faker import Faker\n",
|
||||
"from tqdm.notebook import trange\n",
|
||||
"faker = Faker()\n",
|
||||
"\n",
|
||||
"with client.pipeline() as p:\n",
|
||||
" for _ in trange(15_000_000):\n",
|
||||
" p.hset(f\"profile:{ulid()}\", mapping={\"name\": \"John Doe\", \"sex\": \"M\"})\n",
|
||||
" p.execute()\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23933aa3-ff5d-45f9-b402-59e58c02a2b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,133 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "05c0de49-0eb8-4cd2-89b2-c312111dda85",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n",
|
||||
"Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n",
|
||||
"Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.13/site-packages (4.67.3)\n",
|
||||
"Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n",
|
||||
"Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n",
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n",
|
||||
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n",
|
||||
"Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n",
|
||||
"Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install redis hiredis fastid tqdm kagglehub"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "90326242-f141-4035-9053-d3aab6cc9224",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from redis.cluster import RedisCluster\n",
|
||||
"from redis.commands.search.field import TextField, NumericField, TagField\n",
|
||||
"from redis.commands.search.index_definition import IndexDefinition, IndexType\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"client = RedisCluster(host=\"redis\", port=6379, username=\"admin\", password=\"admin\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "50b9f473-8924-4d03-acf8-71ecf25e54a8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "887890574a2048a587b6ed4dd8eef7c6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1000000 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from fastid import ulid\n",
|
||||
"from tqdm.notebook import trange\n",
|
||||
"\n",
|
||||
"with client.pipeline() as p:\n",
|
||||
" for _ in trange(1_000_000):\n",
|
||||
" p.hset(f\"profile:{ulid()}\", mapping={\"name\": \"John Doe\", \"sex\": \"M\"})\n",
|
||||
" p.execute()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "23933aa3-ff5d-45f9-b402-59e58c02a2b3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"b'OK'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"schema = (\n",
|
||||
" TextField(\"name\"),\n",
|
||||
" TagField(\"sex\")\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"created_idx = client.ft(\"idx:profile\").create_index(\n",
|
||||
" schema,\n",
|
||||
" definition=IndexDefinition(\n",
|
||||
" prefix=[\"profile:\"], index_type=IndexType.HASH\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,240 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "071f6969-c01a-4369-a763-871f5b9e65b3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n",
|
||||
"Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n",
|
||||
"Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.13/site-packages (4.67.3)\n",
|
||||
"Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n",
|
||||
"Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n",
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n",
|
||||
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n",
|
||||
"Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n",
|
||||
"Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install redis hiredis fastid tqdm kagglehub"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "526cb932-59aa-489e-bbba-954ac645b633",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from redis.cluster import RedisCluster\n",
|
||||
"from redis.cache import CacheConfig\n",
|
||||
"from redis.retry import Retry\n",
|
||||
"from redis.backoff import ExponentialBackoff\n",
|
||||
"from pathlib import Path\n",
|
||||
"import kagglehub\n",
|
||||
"from csv import DictReader\n",
|
||||
"from fastid import ulid\n",
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"retry = Retry(ExponentialBackoff(), 8)\n",
|
||||
"client = RedisCluster(\n",
|
||||
" host=\"redis\",\n",
|
||||
" port=6379,\n",
|
||||
" username=\"admin\",\n",
|
||||
" password=\"admin\",\n",
|
||||
" retry=retry,\n",
|
||||
" protocol=3,\n",
|
||||
" cache_config=CacheConfig(),\n",
|
||||
" decode_responses=True,\n",
|
||||
" health_check_interval=3,\n",
|
||||
" \n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "8d0875b5-f004-44fc-9519-89fc73c42b77",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/awesomizer/rainbox-six-siege-dataset/versions/1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset = Path(kagglehub.dataset_download(\"awesomizer/rainbox-six-siege-dataset\"))\n",
|
||||
"\n",
|
||||
"print(\"Path to dataset files:\", dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "be4c5461-1db2-4226-b71e-cdc7f06615bd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"full_dump = dataset/\"datadump_S5\"/\"datadump_S5.csv\"\n",
|
||||
"objectives = dataset/\"datadump_S5_summary_objectives\"/\"datadump_S5_summary_objectives.csv\"\n",
|
||||
"operator_loadouts = dataset/\"dataDump_s5_summary_operator_loadout\"/\"dataDump_s5_summary_operator_loadout.csv\"\n",
|
||||
"display(full_dump.exists())\n",
|
||||
"display(objectives.exists())\n",
|
||||
"display(operator_loadouts.exists())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "2efb5699-cced-4df0-9d76-278e23874436",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with full_dump.open(\"rb\") as f:\n",
|
||||
" rows = sum(1 for line in f) - 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9177760b-eec4-4ab4-8242-0dd8df5db9dd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"daily_matches = {}\n",
|
||||
"with full_dump.open(errors=\"ignore\") as csvfile:\n",
|
||||
" reader = DictReader(csvfile, delimiter=\";\")\n",
|
||||
" for i, row in enumerate(tqdm(reader, desc=\"Importing raw data into Redis...\", total=rows, unit=\"row\")):\n",
|
||||
" if row[\"dateid\"] not in daily_matches: # There's a ton of data so we just select two matches per day\n",
|
||||
" daily_matches[row[\"dateid\"]] = set((row[\"matchid\"],))\n",
|
||||
" display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n",
|
||||
" if row[\"matchid\"] not in daily_matches[row[\"dateid\"]]: # Rows from a given day that aren't related to a specific match are ignored\n",
|
||||
" if len(daily_matches[row[\"dateid\"]]) >= 2: # Skip only if we already have two matches picked\n",
|
||||
" continue\n",
|
||||
" daily_matches[row[\"dateid\"]].add(row[\"matchid\"])\n",
|
||||
" display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n",
|
||||
" client.hset(f\"raw:{ulid()}\", mapping=row)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d8b75e35-8bef-4b2b-ae25-1982076c73fd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from redis.commands.search.field import TagField, NumericField, TextField\n",
|
||||
"from redis.commands.search.index_definition import IndexDefinition, IndexType\n",
|
||||
"\n",
|
||||
"schema = (\n",
|
||||
" NumericField(\"dateid\", sortable=True, no_index=True),\n",
|
||||
" TagField(\"platform\"),\n",
|
||||
" TagField(\"gamemode\"),\n",
|
||||
" TagField(\"mapname\"),\n",
|
||||
" NumericField(\"matchid\"),\n",
|
||||
" NumericField(\"roundnumber\", sortable=True, no_index=True),\n",
|
||||
" TagField(\"objectivelocation\"),\n",
|
||||
" TagField(\"winrole\"),\n",
|
||||
" TagField(\"endroundreason\"),\n",
|
||||
" NumericField(\"roundduration\", sortable=True, no_index=True),\n",
|
||||
" NumericField(\"clearancelevel\", sortable=True, no_index=True),\n",
|
||||
" TagField(\"skillrank\"),\n",
|
||||
" TagField(\"role\"),\n",
|
||||
" NumericField(\"team\"),\n",
|
||||
" NumericField(\"haswon\"),\n",
|
||||
" TagField(\"operator\"),\n",
|
||||
" NumericField(\"nbkills\", sortable=True, no_index=True),\n",
|
||||
" NumericField(\"isdead\"),\n",
|
||||
" TagField(\"primaryweapon\"),\n",
|
||||
" TagField(\"primaryweapontype\"),\n",
|
||||
" TagField(\"primarysight\"),\n",
|
||||
" TagField(\"primarygrip\"),\n",
|
||||
" TagField(\"primaryunderbarrel\"),\n",
|
||||
" TagField(\"primarybarrel\"),\n",
|
||||
" TagField(\"secondaryweapon\"),\n",
|
||||
" TagField(\"secondaryweapontype\"),\n",
|
||||
" TagField(\"secondarysight\"),\n",
|
||||
" TagField(\"secondarygrip\"),\n",
|
||||
" TagField(\"secondaryunderbarrel\"),\n",
|
||||
" TagField(\"secondarybarrel\"),\n",
|
||||
" TagField(\"secondarygadget\")\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"client.ft().create_index(schema, definition=IndexDefinition(prefix=(\"raw:\",), index_type=IndexType.HASH))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "015ac04d-5f92-4323-8814-40ec35384a3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user