{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Assemble WebSky background catalog with Dask\n" ], "id": "01fb53b775b0" }, { "cell_type": "code", "execution_count": 2, "id": "0ed31a46-d481-4958-af82-3889e2f6b80a", "metadata": { "tags": [] }, "outputs": [], "source": [ "import h5pickle as h5py\n", "import numpy as np\n", "import healpy as hp\n", "import matplotlib.pyplot as plt\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 3, "id": "4d97b61d", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "| Provider | Environment variable | Set? | Models |\n", "|----------|----------------------|------|--------|\n", "| `gemini` | `GOOGLE_API_KEY` | | |\n" ], "text/plain": [ "gemini\n", "Requires environment variable: GOOGLE_API_KEY (set)\n", "* gemini:gemini-1.0-pro\n", "* gemini:gemini-1.0-pro-001\n", "* gemini:gemini-1.0-pro-latest\n", "* gemini:gemini-1.0-pro-vision-latest\n", "* gemini:gemini-pro\n", "* gemini:gemini-pro-vision\n", "\n" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext jupyter_ai\n", "%ai list gemini" ] }, { "cell_type": "code", "execution_count": 4, "id": "6f9001d6", "metadata": {}, "outputs": [], "source": [ "#%%ai gemini:gemini-pro -f code\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "0dcee1ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import healpy as hp\n", "hp.version" ] }, { "cell_type": "code", "execution_count": 6, "id": "66420d6e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Created `%gm` as an alias for `%ai gemini:gemini-pro -f code`.\n", "Created `%%gm` as an alias for `%%ai gemini:gemini-pro -f code`.\n" ] } ], "source": [ "%alias_magic gm ai -p \"gemini:gemini-pro -f code\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "263a8761", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "num_threads = 128\n", "os.environ[\"OMP_NUM_THREADS\"] = \"1\"" ] }, { "cell_type": "code", "execution_count": 8, "id": "1b709764", "metadata": {}, "outputs": [], "source": [ "from dask.distributed import Client, LocalCluster\n", "\n", "cluster = LocalCluster(n_workers=num_threads, threads_per_worker=1, processes=True)\n", "client = Client(cluster)" ] }, { "cell_type": "code", "execution_count": 9, "id": "5f8de1f5", "metadata": {}, "outputs": [], "source": [ "cutoff_flux = 1e-3" ] }, { "cell_type": "code", "execution_count": 10, "id": "82d11cad", "metadata": {}, "outputs": [], "source": [ "plot = False" ] }, { "cell_type": "code", "execution_count": 11, "id": "8e35f4b9-bc39-49e9-af61-67464526f4d9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2\n" ] } ], "source": [ "cd /global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2" ] }, { "cell_type": "code", "execution_count": 12, "id": "d0653ac2-3d67-4480-849d-bcca2727a143", "metadata": { "tags": [] }, "outputs": [], "source": [ "cat = h5py.File(\"catalog_100.0.h5\", \"r\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "d06f8c9e", "metadata": {}, "outputs": [], "source": [ "#%%ai gemini:gemini-pro -f code\n", " \n", "#find the fields in a h5py File" ] }, { "cell_type": "code", "execution_count": 14, "id": "b02cba19", "metadata": {}, "outputs": [], "source": [ "import dask.array as da" ] }, { "cell_type": "markdown", "id": "80b4c835", "metadata": {}, "source": [ "There are no metadata in the file, I guess fluxes are in `Jy`" ] }, { "cell_type": "code", "execution_count": 15, "id": "886dbbaf-8890-449c-bab4-2f7eba722d31", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import xarray as xr" ] }, { "cell_type": "code", "execution_count": 16, "id": "ec051ccf", "metadata": {}, "outputs": [], "source": [ "field = 'flux'" ] }, { "cell_type": "code", "execution_count": 17, "id": "efc4be5b", "metadata": {}, "outputs": [], "source": [ "chunk_size = int(1e6)" ] }, { "cell_type": "code", "execution_count": 18, "id": "988dd280", "metadata": {}, "outputs": [], "source": [ "cat_xr = xr.open_dataset(\"catalog_100.0.h5\", chunks=chunk_size)\n", "cat_xr = cat_xr.rename({\"phony_dim_0\":\"index\"})" ] }, { "cell_type": "code", "execution_count": 19, "id": "17200e51", "metadata": {}, "outputs": [], "source": [ "cutoff_mask = (cat_xr.flux < cutoff_flux).compute()" ] }, { "cell_type": "code", "execution_count": 20, "id": "fb1ce190", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The jupyter_ai extension is already loaded. To reload it, use:\n", " %reload_ext jupyter_ai\n" ] }, { "data": { "text/markdown": [ "| Provider | Environment variable | Set? | Models |\n", "|----------|----------------------|------|--------|\n", "| `gemini` | `GOOGLE_API_KEY` | | |\n" ], "text/plain": [ "gemini\n", "Requires environment variable: GOOGLE_API_KEY (set)\n", "* gemini:gemini-1.0-pro\n", "* gemini:gemini-1.0-pro-001\n", "* gemini:gemini-1.0-pro-latest\n", "* gemini:gemini-1.0-pro-vision-latest\n", "* gemini:gemini-pro\n", "* gemini:gemini-pro-vision\n", "\n" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext jupyter_ai\n", "%ai list gemini" ] }, { "cell_type": "code", "execution_count": 21, "id": "878d216c", "metadata": {}, "outputs": [], "source": [ "#%%ai gemini:gemini-pro -f code" ] }, { "cell_type": "code", "execution_count": 22, "id": "38e1a2ed", "metadata": {}, "outputs": [], "source": [ "pol_coeff = xr.open_dataarray(\n", " \"/pscratch/sd/z/zonca/websky_full_catalog_polarized flux.h5\", chunks=chunk_size)[:, cutoff_mask]" ] }, { "cell_type": "code", "execution_count": 23, "id": "3eeaf786", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'flux' (power: 5, index: 281384121)>\n",
       "dask.array<getitem, shape=(5, 281384121), dtype=float64, chunksize=(5, 999136), chunktype=numpy.ndarray>\n",
       "Coordinates:\n",
       "  * power    (power) int64 4 3 2 1 0\n",
       "  * index    (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375
" ], "text/plain": [ "\n", "dask.array\n", "Coordinates:\n", " * power (power) int64 4 3 2 1 0\n", " * index (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pol_coeff" ] }, { "cell_type": "code", "execution_count": 24, "id": "07789546", "metadata": {}, "outputs": [], "source": [ "temp_coeff = xr.open_dataarray(\"/pscratch/sd/z/zonca/websky_full_catalog_flux.h5\", chunks=chunk_size)[:, cutoff_mask]" ] }, { "cell_type": "code", "execution_count": 25, "id": "40648c82", "metadata": {}, "outputs": [], "source": [ "output_catalog = xr.Dataset({\"logpolycoefpolflux\":pol_coeff,\"logpolycoefflux\":temp_coeff })" ] }, { "cell_type": "code", "execution_count": 27, "id": "449d730c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:             (power: 5, index: 281384121)\n",
       "Coordinates:\n",
       "  * power               (power) int64 4 3 2 1 0\n",
       "  * index               (index) int64 0 1 2 3 ... 281756373 281756374 281756375\n",
       "Data variables:\n",
       "    logpolycoefpolflux  (index, power) float64 dask.array<chunksize=(999136, 5), meta=np.ndarray>\n",
       "    logpolycoefflux     (index, power) float64 dask.array<chunksize=(999136, 5), meta=np.ndarray>
" ], "text/plain": [ "\n", "Dimensions: (power: 5, index: 281384121)\n", "Coordinates:\n", " * power (power) int64 4 3 2 1 0\n", " * index (index) int64 0 1 2 3 ... 281756373 281756374 281756375\n", "Data variables:\n", " logpolycoefpolflux (index, power) float64 dask.array\n", " logpolycoefflux (index, power) float64 dask.array" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_catalog" ] }, { "cell_type": "code", "execution_count": 33, "id": "788118dd", "metadata": {}, "outputs": [], "source": [ "output_catalog = output_catalog.transpose()" ] }, { "cell_type": "code", "execution_count": 34, "id": "b4fe03d8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(281384121, 5)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_catalog[\"logpolycoefflux\"].shape" ] }, { "cell_type": "code", "execution_count": 35, "id": "64734e1b", "metadata": {}, "outputs": [], "source": [ "output_catalog.logpolycoefflux.attrs[\"units\"] = \"Jy\"\n", "output_catalog.logpolycoefpolflux.attrs[\"units\"] = \"Jy\"" ] }, { "cell_type": "code", "execution_count": 36, "id": "b7eaad17", "metadata": {}, "outputs": [], "source": [ "for coord in [\"theta\", \"phi\"]:\n", " output_catalog = output_catalog.assign_coords(\n", " **{coord:((\"index\"), cat_xr[coord][cutoff_mask].data)})" ] }, { "cell_type": "code", "execution_count": 37, "id": "b5b3f316", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/global/common/software/cmb/zonca/conda/pycmb/lib/python3.10/site-packages/distributed/client.py:3361: UserWarning: Sending large graph of size 1.05 GiB.\n", "This may cause some slowdown.\n", "Consider loading the data with Dask directly\n", " or using futures or delayed objects to embed the data into the graph without repetition.\n", "See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.\n", " warnings.warn(\n", "/global/common/software/cmb/zonca/conda/pycmb/lib/python3.10/site-packages/numpy/polynomial/polynomial.py:756: RuntimeWarning: overflow encountered in multiply\n", " c0 = c[-i] + c0*x\n" ] } ], "source": [ "output_catalog[\"flux_100\"] = np.polynomial.polynomial.polyval(\n", " np.log(100), output_catalog[\"logpolycoefflux\"][:,::-1], tensor=False)" ] }, { "cell_type": "code", "execution_count": 38, "id": "945010ff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'flux_100' ()>\n",
       "array(inf)
" ], "text/plain": [ "\n", "array(inf)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_catalog[\"flux_100\"].max()" ] }, { "cell_type": "code", "execution_count": 39, "id": "a3d49ed2", "metadata": {}, "outputs": [], "source": [ "output_catalog = output_catalog.sortby(\"flux_100\", ascending=False)\n", "del output_catalog[\"flux_100\"]" ] }, { "cell_type": "code", "execution_count": 40, "id": "997cca47", "metadata": {}, "outputs": [], "source": [ "output_catalog.coords[\"theta\"].attrs[\"units\"] = \"rad\"\n", "output_catalog.coords[\"phi\"].attrs[\"units\"] = \"rad\"" ] }, { "cell_type": "code", "execution_count": 41, "id": "a04ad716", "metadata": {}, "outputs": [], "source": [ "output_catalog.attrs[\"notes\"] = \\\n", "\"\"\"Catalog of sources where the flux in Jy at any frequency is calculated with a 5th order polynomial in the logarithm of the frequency in GHz, separately for temperature and polarization.\n", "The catalog does not contain information about the polarization angle of a source.\n", "The catalog sorted in descending order based on the source flux at 100 GHz\"\"\"" ] }, { "cell_type": "code", "execution_count": 44, "id": "918171cd", "metadata": {}, "outputs": [], "source": [ "output_filename = f\"/pscratch/sd/z/zonca/websky_full_catalog_trasp.h5\"" ] }, { "cell_type": "code", "execution_count": 42, "id": "6e73f4df", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/global/common/software/cmb/zonca/conda/pycmb/lib/python3.10/site-packages/distributed/client.py:3361: UserWarning: Sending large graph of size 2.10 GiB.\n", "This may cause some slowdown.\n", "Consider loading the data with Dask directly\n", " or using futures or delayed objects to embed the data into the graph without repetition.\n", "See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.\n", " warnings.warn(\n", "2024-09-21 14:02:36,894 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 2.99 GiB -- Worker memory limit: 3.72 GiB\n", "2024-09-21 14:02:36,934 - distributed.worker.memory - WARNING - Worker is at 79% memory usage. Resuming worker. Process memory: 2.95 GiB -- Worker memory limit: 3.72 GiB\n", "2024-09-21 14:02:37,039 - distributed.worker.memory - WARNING - Worker is at 81% memory usage. Pausing worker. Process memory: 3.05 GiB -- Worker memory limit: 3.72 GiB\n", "2024-09-21 14:02:37,159 - distributed.worker.memory - WARNING - Worker is at 42% memory usage. Resuming worker. Process memory: 1.58 GiB -- Worker memory limit: 3.72 GiB\n", "2024-09-21 14:02:49,972 - distributed.worker.memory - WARNING - Worker is at 82% memory usage. Pausing worker. Process memory: 3.08 GiB -- Worker memory limit: 3.72 GiB\n", "2024-09-21 14:02:50,525 - distributed.worker.memory - WARNING - Worker is at 78% memory usage. Resuming worker. Process memory: 2.92 GiB -- Worker memory limit: 3.72 GiB\n" ] } ], "source": [ "output_catalog.to_netcdf(\n", " output_filename, format=\"NETCDF4\") # requires netcdf4 package" ] }, { "cell_type": "code", "execution_count": 45, "id": "c95cf867", "metadata": {}, "outputs": [], "source": [ "import h5py" ] }, { "cell_type": "code", "execution_count": 46, "id": "b51507c3", "metadata": {}, "outputs": [], "source": [ "f = h5py.File(output_filename)" ] }, { "cell_type": "code", "execution_count": 47, "id": "750ab562", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.keys()" ] }, { "cell_type": "code", "execution_count": 50, "id": "6a9e291f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 5.37899652e-09, -1.29664725e-07, 1.20804354e-06, -5.22231671e-06,\n", " 9.00274650e-06])" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f[\"logpolycoefflux\"][0]" ] }, { "cell_type": "code", "execution_count": 1, "id": "f116bd47", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: benchmark-pixell-runner: command not found\r\n" ] } ], "source": [ "!benchmark-pixell-runner" ] }, { "cell_type": "code", "execution_count": 2, "id": "1948f953", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " total used free shared buff/cache available\r\n", "Mem: 515307 288278 237454 2284 3068 227028\r\n", "Swap: 0 0 0\r\n" ] } ], "source": [ "!free -m" ] }, { "cell_type": "code", "execution_count": 1, "id": "0166c39b", "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "catalog = xr.open_dataset(\"/pscratch/sd/z/zonca/websky_full_catalog_trasp.h5\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "821355d7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:             (power: 5, index: 281384121)\n",
       "Coordinates:\n",
       "  * power               (power) int64 4 3 2 1 0\n",
       "  * index               (index) int64 0 1 2 3 ... 281756373 281756374 281756375\n",
       "    theta               (index) float64 ...\n",
       "    phi                 (index) float64 ...\n",
       "Data variables:\n",
       "    logpolycoefpolflux  (index, power) float64 ...\n",
       "    logpolycoefflux     (index, power) float64 ...\n",
       "Attributes:\n",
       "    notes:    Catalog of sources where the flux in Jy at any frequency is cal...
" ], "text/plain": [ "\n", "Dimensions: (power: 5, index: 281384121)\n", "Coordinates:\n", " * power (power) int64 4 3 2 1 0\n", " * index (index) int64 0 1 2 3 ... 281756373 281756374 281756375\n", " theta (index) float64 ...\n", " phi (index) float64 ...\n", "Data variables:\n", " logpolycoefpolflux (index, power) float64 ...\n", " logpolycoefflux (index, power) float64 ...\n", "Attributes:\n", " notes: Catalog of sources where the flux in Jy at any frequency is cal..." ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog" ] }, { "cell_type": "code", "execution_count": 3, "id": "69ade451", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(281384121, 5)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catalog[\"logpolycoefflux\"].shape" ] }, { "cell_type": "code", "execution_count": null, "id": "8c123c12", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "pycmb", "language": "python", "name": "pycmb" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }