{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Create WebSky background catalog with Dask\n" ], "id": "dfbf38472bca" }, { "cell_type": "code", "execution_count": 1, "id": "0ed31a46-d481-4958-af82-3889e2f6b80a", "metadata": { "tags": [] }, "outputs": [], "source": [ "import h5pickle as h5py\n", "import numpy as np\n", "import healpy as hp\n", "import matplotlib.pyplot as plt\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "id": "4d97b61d", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "| Provider | Environment variable | Set? | Models |\n", "|----------|----------------------|------|--------|\n", "| `gemini` | `GOOGLE_API_KEY` | | |\n" ], "text/plain": [ "gemini\n", "Requires environment variable: GOOGLE_API_KEY (set)\n", "* gemini:gemini-1.0-pro\n", "* gemini:gemini-1.0-pro-001\n", "* gemini:gemini-1.0-pro-latest\n", "* gemini:gemini-1.0-pro-vision-latest\n", "* gemini:gemini-pro\n", "* gemini:gemini-pro-vision\n", "\n" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext jupyter_ai\n", "%ai list gemini" ] }, { "cell_type": "code", "execution_count": 3, "id": "6f9001d6", "metadata": {}, "outputs": [], "source": [ "#%%ai gemini:gemini-pro -f code\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "0dcee1ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import healpy as hp\n", "hp.version" ] }, { "cell_type": "code", "execution_count": 5, "id": "66420d6e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Created `%gm` as an alias for `%ai gemini:gemini-pro -f code`.\n", "Created `%%gm` as an alias for `%%ai gemini:gemini-pro -f code`.\n" ] } ], "source": [ "%alias_magic gm ai -p \"gemini:gemini-pro -f code\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "263a8761", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "num_threads = 128\n", "os.environ[\"OMP_NUM_THREADS\"] = \"1\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "5f8de1f5", "metadata": {}, "outputs": [], "source": [ "cutoff_flux = 10000" ] }, { "cell_type": "code", "execution_count": 8, "id": "1e31f0a2", "metadata": {}, "outputs": [], "source": [ "output_filename = \"/pscratch/sd/z/zonca/websky_full_catalog.h5\"" ] }, { "cell_type": "code", "execution_count": 9, "id": "82d11cad", "metadata": {}, "outputs": [], "source": [ "plot = False" ] }, { "cell_type": "code", "execution_count": 10, "id": "8e35f4b9-bc39-49e9-af61-67464526f4d9", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2\n" ] } ], "source": [ "cd /global/cfs/cdirs/sobs/www/users/Radio_WebSky/matched_catalogs_2" ] }, { "cell_type": "code", "execution_count": 11, "id": "031d3e3d-b9d0-4c73-80bb-bbd804a825fe", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "catalog_100.0.h5 catalog_232.0.h5 catalog_353.0.h5 catalog_643.0.h5\r\n", "catalog_111.0.h5 catalog_24.5.h5 catalog_375.0.h5 catalog_67.8.h5\r\n", "catalog_129.0.h5 catalog_256.0.h5 catalog_409.0.h5 catalog_70.0.h5\r\n", "catalog_143.0.h5 catalog_27.3.h5 catalog_41.7.h5 catalog_729.0.h5\r\n", "catalog_153.0.h5 catalog_275.0.h5 catalog_44.0.h5 catalog_73.7.h5\r\n", "catalog_164.0.h5 catalog_294.0.h5 catalog_467.0.h5 catalog_79.6.h5\r\n", "catalog_18.7.h5 catalog_30.0.h5 catalog_47.4.h5 catalog_817.0.h5\r\n", "catalog_189.0.h5 catalog_306.0.h5 catalog_525.0.h5 catalog_857.0.h5\r\n", "catalog_21.6.h5 catalog_314.0.h5 catalog_545.0.h5 catalog_90.2.h5\r\n", "catalog_210.0.h5 catalog_340.0.h5 catalog_584.0.h5 catalog_906.0.h5\r\n", "catalog_217.0.h5 catalog_35.9.h5 catalog_63.9.h5 flux_coeff.h5\r\n" ] } ], "source": [ "%ls" ] }, { "cell_type": "code", "execution_count": 12, "id": "ba71f7d1", "metadata": { "tags": [] }, "outputs": [], "source": [ "freqs = [\n", " \"18.7\",\n", " \"24.5\",\n", " \"44.0\",\n", " \"70.0\",\n", " \"100.0\",\n", " \"143.0\",\n", " \"217.0\",\n", " \"353.0\",\n", " \"545.0\",\n", " \"643.0\",\n", " \"729.0\",\n", " \"857.0\",\n", " \"906.0\",\n", "]" ] }, { "cell_type": "code", "execution_count": 13, "id": "ec6aeb18", "metadata": {}, "outputs": [], "source": [ "freqs_array = np.array(list(map(float, freqs)))" ] }, { "cell_type": "code", "execution_count": 14, "id": "d0653ac2-3d67-4480-849d-bcca2727a143", "metadata": { "tags": [] }, "outputs": [], "source": [ "cat = h5py.File(\"catalog_100.0.h5\", \"r\")" ] }, { "cell_type": "code", "execution_count": 18, "id": "fd871a2b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1.64009452, 1.64009452, 1.64009452, 1.69043016], dtype='>f8')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat[\"theta\"][:4]" ] }, { "cell_type": "code", "execution_count": 15, "id": "d06f8c9e", "metadata": {}, "outputs": [], "source": [ "#%%ai gemini:gemini-pro -f code\n", " \n", "#find the fields in a h5py File" ] }, { "cell_type": "code", "execution_count": 16, "id": "b02cba19", "metadata": {}, "outputs": [], "source": [ "import dask.array as da" ] }, { "cell_type": "markdown", "id": "80b4c835", "metadata": {}, "source": [ "There are no metadata in the file, I guess fluxes are in `Jy`" ] }, { "cell_type": "code", "execution_count": 17, "id": "1b709764", "metadata": {}, "outputs": [], "source": [ "from dask.distributed import Client, LocalCluster\n", "\n", "cluster = LocalCluster(n_workers=num_threads, threads_per_worker=1, processes=True)\n", "client = Client(cluster)" ] }, { "cell_type": "code", "execution_count": 18, "id": "886dbbaf-8890-449c-bab4-2f7eba722d31", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import xarray as xr" ] }, { "cell_type": "code", "execution_count": 19, "id": "ec051ccf", "metadata": {}, "outputs": [], "source": [ "field = 'flux'" ] }, { "cell_type": "code", "execution_count": 20, "id": "07789546", "metadata": {}, "outputs": [], "source": [ "arrays = [da.from_array(h5py.File(f\"catalog_{freq}.h5\", \"r\")[field], chunks=1000000) for freq in freqs]" ] }, { "cell_type": "code", "execution_count": 21, "id": "6bb7555a", "metadata": {}, "outputs": [], "source": [ "flux = da.stack(arrays, axis=0) " ] }, { "cell_type": "code", "execution_count": 22, "id": "f1e3953e", "metadata": {}, "outputs": [], "source": [ "flux = flux.rechunk(chunks=(13, 1000000))" ] }, { "cell_type": "code", "execution_count": 23, "id": "1e17152a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Array Chunk
Bytes 27.29 GiB 99.18 MiB
Shape (13, 281756376) (13, 1000000)
Dask graph 282 chunks in 41 graph layers
Data type float64 numpy.ndarray
\n", "
\n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " 281756376\n", " 13\n", "\n", "
" ], "text/plain": [ "dask.array" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "flux" ] }, { "cell_type": "code", "execution_count": 24, "id": "64e8f31e", "metadata": {}, "outputs": [], "source": [ "# Only keep sources below cutoff\n", "cutoff_flux_Jy = 1e-3\n", "# flux = flux[:, flux[4, :] < cutoff_flux_Jy]" ] }, { "cell_type": "code", "execution_count": 25, "id": "c442dd8e", "metadata": {}, "outputs": [], "source": [ "# flux.compute_chunk_sizes()" ] }, { "cell_type": "code", "execution_count": 26, "id": "a1ecd6d8", "metadata": {}, "outputs": [], "source": [ "from numba import njit\n", "\n", "@njit\n", "def model(freq, a, b, c, d, e):\n", " log_freq = np.log(freq)\n", " return a * log_freq**4 + b * log_freq**3 + c * log_freq**2 + d * log_freq + e" ] }, { "cell_type": "code", "execution_count": 27, "id": "3c08d830", "metadata": {}, "outputs": [], "source": [ "from scipy.optimize import curve_fit" ] }, { "cell_type": "code", "execution_count": 28, "id": "c30eb71c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 5.37899652e-09, -1.29664725e-07, 1.20804354e-06, -5.22231671e-06,\n", " 9.00274650e-06])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "curve_fit(model, freqs_array, flux[:,0])[0]" ] }, { "cell_type": "code", "execution_count": 29, "id": "efefaf03", "metadata": {}, "outputs": [], "source": [ "def run_curve_fit(flux):\n", " return curve_fit(model, freqs_array, flux)[0]\n", "\n", "coeff = da.apply_along_axis(run_curve_fit, 0, flux)" ] }, { "cell_type": "code", "execution_count": 30, "id": "fd0813a7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2min 2s, sys: 1min 28s, total: 3min 31s\n", "Wall time: 5min 4s\n" ] }, { "data": { "text/plain": [ "array([[ 5.37899652e-09, 6.42822511e-09, 3.31959319e-09,\n", " 1.38862575e-08, 3.78195705e-09, 1.08966713e-08,\n", " 8.68670067e-08, 3.64081373e-09, 3.66938830e-09,\n", " 3.02356547e-09],\n", " [-1.29664725e-07, -1.25727514e-07, -6.53823561e-08,\n", " -2.67337421e-07, -8.86265294e-08, -2.11607996e-07,\n", " -1.96845594e-06, -8.47114797e-08, -7.11052032e-08,\n", " -6.49361314e-08],\n", " [ 1.20804354e-06, 8.85551144e-07, 4.83957483e-07,\n", " 1.85677682e-06, 8.11717509e-07, 1.47786561e-06,\n", " 1.66899947e-05, 7.71488341e-07, 5.08938118e-07,\n", " 5.45560777e-07],\n", " [-5.22231671e-06, -2.61195734e-06, -1.68793603e-06,\n", " -5.55460268e-06, -3.51684601e-06, -4.35226803e-06,\n", " -6.28894194e-05, -3.33517575e-06, -1.66773519e-06,\n", " -2.22563537e-06],\n", " [ 9.00274650e-06, 2.94062995e-06, 2.71462806e-06,\n", " 6.94587053e-06, 6.25448262e-06, 5.04064302e-06,\n", " 8.91598585e-05, 5.95275131e-06, 2.49203039e-06,\n", " 3.97960150e-06]])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "coeff[:,:10].compute()" ] }, { "cell_type": "code", "execution_count": 31, "id": "6769c4cb", "metadata": {}, "outputs": [], "source": [ "import xarray as xr" ] }, { "cell_type": "code", "execution_count": 32, "id": "b160bebd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5, 281756376)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coeff.shape" ] }, { "cell_type": "code", "execution_count": 33, "id": "f5329aa7", "metadata": {}, "outputs": [], "source": [ "xr_flux = xr.DataArray(\n", " data=coeff,\n", " coords={\"power\": np.arange(4, -1, -1), \"index\": da.arange(coeff.shape[1])},\n", " name=\"flux\",\n", ")" ] }, { "cell_type": "code", "execution_count": 34, "id": "6029553c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'flux' (power: 5, index: 281756376)>\n",
       "dask.array<run_curve_fit-along-axis, shape=(5, 281756376), dtype=float64, chunksize=(5, 1000000), chunktype=numpy.ndarray>\n",
       "Coordinates:\n",
       "  * power    (power) int64 4 3 2 1 0\n",
       "  * index    (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375
" ], "text/plain": [ "\n", "dask.array\n", "Coordinates:\n", " * power (power) int64 4 3 2 1 0\n", " * index (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xr_flux" ] }, { "cell_type": "code", "execution_count": 35, "id": "6e73f4df", "metadata": {}, "outputs": [], "source": [ "xr_flux.to_netcdf(\n", " f\"/pscratch/sd/z/zonca/websky_full_catalog_{field}.h5\", format=\"NETCDF4\") # requires netcdf4 package" ] }, { "cell_type": "code", "execution_count": 36, "id": "bab920ff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'flux' (power: 5, index: 281756376)>\n",
       "dask.array<run_curve_fit-along-axis, shape=(5, 281756376), dtype=float64, chunksize=(5, 1000000), chunktype=numpy.ndarray>\n",
       "Coordinates:\n",
       "  * power    (power) int64 4 3 2 1 0\n",
       "  * index    (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375
" ], "text/plain": [ "\n", "dask.array\n", "Coordinates:\n", " * power (power) int64 4 3 2 1 0\n", " * index (index) int64 0 1 2 3 4 ... 281756372 281756373 281756374 281756375" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xr_flux" ] }, { "cell_type": "code", "execution_count": null, "id": "c95cf867", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": null, "lastKernelId": null }, "kernelspec": { "display_name": "pycmb", "language": "python", "name": "pycmb" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }