Inspect WebSky high-flux catalog output¶

[1]:

import h5py
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt

[2]:

freqs = [
    "18.7",
    "24.5",
    "44.0",
    "70.0",
    "100.0",
    "143.0",
    "217.0",
    "353.0",
    "545.0",
    "643.0",
    "729.0",
    "857.0",
    "906.0",
]

[3]:

cat = h5py.File("data/matched_catalogs_2/catalog_100.0.h5", "r")

There are no metadata in the file, I guess fluxes are in Jy

[4]:

cutoff_flux = 1e-3

[5]:

high_flux_sources_mask = cat["flux"][:] > cutoff_flux

[6]:

(high_flux_sources_mask).sum()

[6]:

[7]:

high_flux_sources_mask.mean() * 100

[7]:

0.13211945911740433

[8]:

for k, v in cat.items():
    print(k, v[:3])

flux [3.24291534e-07 3.16862867e-07 3.17171157e-07]
phi [3.22861886 3.22861886 3.22861886]
polarized flux [1.42910628e-09 1.99535624e-08 2.29563857e-09]
theta [1.64009452 1.64009452 1.64009452]

[9]:

(all_indices,) = np.nonzero(high_flux_sources_mask)

[10]:

len(all_indices)

[10]:

[11]:

all_indices = np.array(sorted(all_indices))

all_indices contains the index of the sources in the original Websky catalog

[12]:

all_indices

[12]:

array([    11253,     16428,     24110, ..., 281755117, 281755430,
       281755795])

[13]:

import pandas as pd
import xarray as xr

[14]:

columns = ["theta", "phi", "flux", "polarized flux"]

[15]:

flux = xr.DataArray(
    data=np.zeros((len(all_indices), len(freqs)), dtype=np.float64),
    coords={"index": all_indices, "freq": list(map(float, freqs))},
    name="flux",
)
fluxnorm = flux.copy()

[16]:

polarized_flux = flux.copy()

[17]:

sources_xr = xr.Dataset(
    {"flux": flux, "polarized_flux": polarized_flux, "fluxnorm": fluxnorm}
)
for freq in freqs:
    print(freq)
    cat = h5py.File(f"data/matched_catalogs_2/catalog_{freq}.h5", "r")
    for column in ["flux", "polarized_flux"]:
        sources_xr[column].loc[dict(index=all_indices, freq=float(freq))] = cat[
            column.replace("_", " ")
        ][high_flux_sources_mask]

[18]:

sources_xr

[19]:

min_idx = int(sources_xr.flux.sel(freq=100).argmin().item())
max_idx = int(sources_xr.flux.sel(freq=100).argmax().item())
print("Index with minimum flux at 100 GHz:", min_idx)
print("Index with maximum flux at 100 GHz:", max_idx)

Index with minimum flux at 100 GHz: 64456
Index with maximum flux at 100 GHz: 9474

[20]:

sources_xr = sources_xr.sortby(sources_xr.flux.sel(freq=100.0), ascending=False)

[21]:

len(sources_xr)

[21]:

[22]:

max_loc = int(sources_xr.flux.sel(freq=100).argmax().item())
min_loc = int(sources_xr.flux.sel(freq=100).argmin().item())
print("Location (position) of maximum flux at 100 GHz:", max_loc)
print("Location (position) of minimum flux at 100 GHz:", min_loc)

Location (position) of maximum flux at 100 GHz: 0
Location (position) of minimum flux at 100 GHz: 372254

[23]:

sources_xr.coords["index"]

[23]:

<xarray.DataArray 'index' (index: 372255)> Size: 3MB
array([ 27897657, 242175600, 270516228, ..., 246732122, 263347056, 166742332])
Coordinates:
  * index    (index) int64 3MB 27897657 242175600 ... 263347056 166742332

[24]:

# this was needed for some missing functionality in an older version of xarray
#sources_xr.coords["index"] = np.arange(len(sources_xr.coords["index"]))

[25]:

sources_xr["fluxnorm"] = sources_xr["flux"] / sources_xr["flux"].sel(freq=100)

[26]:

#print(sources_xr["fluxnorm"].loc[dict(index=s)], sources_xr["flux"].loc[dict(index=s)])

[27]:

#sources_xr.fluxnorm.plot(vmin=0, vmax=100)
#plt.figure()
#sources_xr.flux.plot(vmin=0, vmax=100)

[28]:

sources_xr["logpolycoefflux"] = xr.DataArray(
    np.zeros((len(all_indices), 5), dtype=np.float64),
    dims=["index", "power"],
    coords={"power": np.arange(5)[::-1]},
)
sources_xr["logpolycoefnorm"] = sources_xr["logpolycoefflux"].copy()
sources_xr["logpolycoefpolflux"] = sources_xr["logpolycoefflux"].copy()

[29]:

from scipy.optimize import curve_fit
import time

def is_interactive():
    try:
        from IPython import get_ipython
        return get_ipython() is not None
    except ImportError:
        return False

def model(freq, a, b, c, d, e):
    log_freq = np.log(freq)
    return a * log_freq**4 + b * log_freq**3 + c * log_freq**2 + d * log_freq + e

start_time = time.time()
total = len(sources_xr.coords["index"])
indices = sources_xr.coords["index"]
if is_interactive():
    from tqdm.notebook import tqdm
    iterator = tqdm(indices, desc="Fitting sources")
else:
    iterator = indices

for i, s in enumerate(iterator):
    sources_xr["logpolycoefflux"].loc[dict(index=s)] = curve_fit(
        model, sources_xr.coords["freq"], sources_xr.flux.sel(index=s)
    )[0]
    sources_xr["logpolycoefpolflux"].loc[dict(index=s)] = curve_fit(
        model, sources_xr.coords["freq"], sources_xr.polarized_flux.sel(index=s)
    )[0]
    sources_xr["logpolycoefnorm"].loc[dict(index=s)] = curve_fit(
        model, sources_xr.coords["freq"], sources_xr.fluxnorm.sel(index=s)
    )[0]
    if (i + 1) % 100 == 0 or (i + 1) == total:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining = (total - (i + 1)) / rate if rate > 0 else float("inf")
        msg = f"ETA: {int(remaining // 3600)}h {int((remaining % 3600) // 60)}m"
        if is_interactive():
            iterator.set_postfix_str(msg)
        else:
            print(f"Processed {i+1}/{total} | {msg}")

[30]:

# for s in range(len(all_indices)):
#     plt.figure()
#     sources_xr.flux.sel(index=s).plot(marker="o", linestyle="none")  # , xscale="log")
#     sources_xr.fluxnorm.sel(index=s).plot(
#         marker="o", linestyle="none"
#     )  # , xscale="log")

#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefflux.sel(index=s)),
#     )
#     plt.loglog(
#         sources_xr.coords["freq"],
#         model(sources_xr.coords["freq"], *sources_xr.logpolycoefnorm.sel(index=s)),
#     )
#     plt.grid()
#     break

[31]:

sources_xr.logpolycoefflux.min(), sources_xr.logpolycoefflux.max()

[31]:

(<xarray.DataArray 'logpolycoefflux' ()> Size: 8B
 array(-17557.80083613),
 <xarray.DataArray 'logpolycoefflux' ()> Size: 8B
 array(23993.59700202))

[32]:

# plt.figure(figsize=(12, 5))
# plt.subplot(121)
# sources_xr.logpolycoefflux.plot(vmax=50, vmin=-50)
# plt.subplot(122)
# sources_xr.logpolycoefnorm.plot(vmax=50, vmin=-50)pwd

[33]:

# plt.figure(figsize=(15, 8))

# for power in range(5):
#     plt.subplot(231 + power)

#     np.fabs(sources_xr.logpolycoefflux.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20), density=False, lw=3, label="fluxes"
#     )

#     np.fabs(sources_xr.logpolycoefnorm.loc[dict(power=power)]).plot.hist(
#         bins=np.logspace(-0, 4, 20),
#         density=False,
#         histtype="step",
#         lw=2,
#         label="normalized fluxes",
#         linestyle="--",
#     )
#     plt.grid()
#     plt.title(f"Power {power}")
#     plt.legend()
#     plt.xscale("log")
#     plt.xlabel(None)

[34]:

output_catalog = sources_xr[["logpolycoefflux","logpolycoefpolflux"]]

[35]:

#no need to override this again
#output_catalog["index"] = all_indices

[36]:

output_catalog.logpolycoefflux.attrs["units"] = "Jy"
output_catalog.logpolycoefpolflux.attrs["units"] = "Jy"

[37]:

for coord in ["theta", "phi"]:
    output_catalog = output_catalog.assign_coords(**{coord:(("index"), cat[coord][high_flux_sources_mask].astype(np.float64))})

[38]:

output_catalog

[39]:

output_filename = "data/websky_high_flux_catalog_1mJy.h5"

[40]:

!ls -l $output_filename

ls: cannot access 'data/websky_high_flux_catalog_1mJy.h5': No such file or directory

[41]:

output_catalog.coords["theta"].attrs["units"] = "rad"
output_catalog.coords["phi"].attrs["units"] = "rad"
output_catalog.coords["theta"].attrs["reference_frame"] = "Galactic"
output_catalog.coords["phi"].attrs["reference_frame"] = "Galactic"

[42]:

output_catalog.attrs["description"] = (
    "Websky catalog of sources with flux > 1 mJy at 100 GHz, fitted with a 4th order polynomial in log frequency. "
    "Galactic reference frame. Sorted by flux at 100 GHz (descending). "
    "The 'index' coordinate gives the original index in the Websky catalog."
)

[43]:

output_catalog.to_netcdf(output_filename, format="NETCDF4", mode="w")  # requires netcdf4 package

[44]:

%ls -lah $output_filename

-rw-rw-r-- 1 azonca azonca 37M May 12 22:41 data/websky_high_flux_catalog_1mJy.h5

[45]:

import xarray

[46]:

xarray.open_dataset(output_filename)

[47]:

import h5py
f = h5py.File(output_filename, 'r')
f["logpolycoefflux"]

[47]:

<HDF5 dataset "logpolycoefflux": shape (372255, 5), type "<f8">

[48]:

f["logpolycoefflux"].attrs["units"]

[48]:

b'Jy'

Navigation

Inspect WebSky high-flux catalog output¶