Inspect WebSky high-flux catalog output¶
[1]:
import h5py
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt
[2]:
freqs = [
"18.7",
"24.5",
"44.0",
"70.0",
"100.0",
"143.0",
"217.0",
"353.0",
"545.0",
"643.0",
"729.0",
"857.0",
"906.0",
]
[3]:
cat = h5py.File("data/matched_catalogs_2/catalog_100.0.h5", "r")
There are no metadata in the file, I guess fluxes are in Jy
[4]:
cutoff_flux = 1e-3
[5]:
high_flux_sources_mask = cat["flux"][:] > cutoff_flux
[6]:
(high_flux_sources_mask).sum()
[6]:
372255
[7]:
high_flux_sources_mask.mean() * 100
[7]:
0.13211945911740433
[8]:
for k, v in cat.items():
print(k, v[:3])
flux [3.24291534e-07 3.16862867e-07 3.17171157e-07]
phi [3.22861886 3.22861886 3.22861886]
polarized flux [1.42910628e-09 1.99535624e-08 2.29563857e-09]
theta [1.64009452 1.64009452 1.64009452]
[9]:
(all_indices,) = np.nonzero(high_flux_sources_mask)
[10]:
len(all_indices)
[10]:
372255
[11]:
all_indices = np.array(sorted(all_indices))
all_indices contains the index of the sources in the original Websky catalog
[12]:
all_indices
[12]:
array([ 11253, 16428, 24110, ..., 281755117, 281755430,
281755795])
[13]:
import pandas as pd
import xarray as xr
[14]:
columns = ["theta", "phi", "flux", "polarized flux"]
[15]:
flux = xr.DataArray(
data=np.zeros((len(all_indices), len(freqs)), dtype=np.float64),
coords={"index": all_indices, "freq": list(map(float, freqs))},
name="flux",
)
fluxnorm = flux.copy()
[16]:
polarized_flux = flux.copy()
[17]:
sources_xr = xr.Dataset(
{"flux": flux, "polarized_flux": polarized_flux, "fluxnorm": fluxnorm}
)
for freq in freqs:
print(freq)
cat = h5py.File(f"data/matched_catalogs_2/catalog_{freq}.h5", "r")
for column in ["flux", "polarized_flux"]:
sources_xr[column].loc[dict(index=all_indices, freq=float(freq))] = cat[
column.replace("_", " ")
][high_flux_sources_mask]
18.7
24.5
44.0
70.0
100.0
143.0
217.0
353.0
545.0
643.0
729.0
857.0
906.0
[18]:
sources_xr
[18]:
<xarray.Dataset> Size: 119MB
Dimensions: (index: 372255, freq: 13)
Coordinates:
* index (index) int64 3MB 11253 16428 24110 ... 281755430 281755795
* freq (freq) float64 104B 18.7 24.5 44.0 ... 729.0 857.0 906.0
Data variables:
flux (index, freq) float64 39MB 0.02854 0.01962 ... 0.001688
polarized_flux (index, freq) float64 39MB 0.001121 0.0004986 ... 3.246e-05
fluxnorm (index, freq) float64 39MB 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0[19]:
min_idx = int(sources_xr.flux.sel(freq=100).argmin().item())
max_idx = int(sources_xr.flux.sel(freq=100).argmax().item())
print("Index with minimum flux at 100 GHz:", min_idx)
print("Index with maximum flux at 100 GHz:", max_idx)
Index with minimum flux at 100 GHz: 64456
Index with maximum flux at 100 GHz: 9474
[20]:
sources_xr = sources_xr.sortby(sources_xr.flux.sel(freq=100.0), ascending=False)
[21]:
len(sources_xr)
[21]:
3
[22]:
max_loc = int(sources_xr.flux.sel(freq=100).argmax().item())
min_loc = int(sources_xr.flux.sel(freq=100).argmin().item())
print("Location (position) of maximum flux at 100 GHz:", max_loc)
print("Location (position) of minimum flux at 100 GHz:", min_loc)
Location (position) of maximum flux at 100 GHz: 0
Location (position) of minimum flux at 100 GHz: 372254
[23]:
sources_xr.coords["index"]
[23]:
<xarray.DataArray 'index' (index: 372255)> Size: 3MB array([ 27897657, 242175600, 270516228, ..., 246732122, 263347056, 166742332]) Coordinates: * index (index) int64 3MB 27897657 242175600 ... 263347056 166742332
[24]:
# this was needed for some missing functionality in an older version of xarray
#sources_xr.coords["index"] = np.arange(len(sources_xr.coords["index"]))
[25]:
sources_xr["fluxnorm"] = sources_xr["flux"] / sources_xr["flux"].sel(freq=100)
[26]:
#print(sources_xr["fluxnorm"].loc[dict(index=s)], sources_xr["flux"].loc[dict(index=s)])
[27]:
#sources_xr.fluxnorm.plot(vmin=0, vmax=100)
#plt.figure()
#sources_xr.flux.plot(vmin=0, vmax=100)
[28]:
sources_xr["logpolycoefflux"] = xr.DataArray(
np.zeros((len(all_indices), 5), dtype=np.float64),
dims=["index", "power"],
coords={"power": np.arange(5)[::-1]},
)
sources_xr["logpolycoefnorm"] = sources_xr["logpolycoefflux"].copy()
sources_xr["logpolycoefpolflux"] = sources_xr["logpolycoefflux"].copy()
[29]:
from scipy.optimize import curve_fit
import time
def is_interactive():
try:
from IPython import get_ipython
return get_ipython() is not None
except ImportError:
return False
def model(freq, a, b, c, d, e):
log_freq = np.log(freq)
return a * log_freq**4 + b * log_freq**3 + c * log_freq**2 + d * log_freq + e
start_time = time.time()
total = len(sources_xr.coords["index"])
indices = sources_xr.coords["index"]
if is_interactive():
from tqdm.notebook import tqdm
iterator = tqdm(indices, desc="Fitting sources")
else:
iterator = indices
for i, s in enumerate(iterator):
sources_xr["logpolycoefflux"].loc[dict(index=s)] = curve_fit(
model, sources_xr.coords["freq"], sources_xr.flux.sel(index=s)
)[0]
sources_xr["logpolycoefpolflux"].loc[dict(index=s)] = curve_fit(
model, sources_xr.coords["freq"], sources_xr.polarized_flux.sel(index=s)
)[0]
sources_xr["logpolycoefnorm"].loc[dict(index=s)] = curve_fit(
model, sources_xr.coords["freq"], sources_xr.fluxnorm.sel(index=s)
)[0]
if (i + 1) % 100 == 0 or (i + 1) == total:
elapsed = time.time() - start_time
rate = (i + 1) / elapsed
remaining = (total - (i + 1)) / rate if rate > 0 else float("inf")
msg = f"ETA: {int(remaining // 3600)}h {int((remaining % 3600) // 60)}m"
if is_interactive():
iterator.set_postfix_str(msg)
else:
print(f"Processed {i+1}/{total} | {msg}")
[30]:
# for s in range(len(all_indices)):
# plt.figure()
# sources_xr.flux.sel(index=s).plot(marker="o", linestyle="none") # , xscale="log")
# sources_xr.fluxnorm.sel(index=s).plot(
# marker="o", linestyle="none"
# ) # , xscale="log")
# plt.loglog(
# sources_xr.coords["freq"],
# model(sources_xr.coords["freq"], *sources_xr.logpolycoefflux.sel(index=s)),
# )
# plt.loglog(
# sources_xr.coords["freq"],
# model(sources_xr.coords["freq"], *sources_xr.logpolycoefnorm.sel(index=s)),
# )
# plt.grid()
# break
[31]:
sources_xr.logpolycoefflux.min(), sources_xr.logpolycoefflux.max()
[31]:
(<xarray.DataArray 'logpolycoefflux' ()> Size: 8B
array(-17557.80083613),
<xarray.DataArray 'logpolycoefflux' ()> Size: 8B
array(23993.59700202))
[32]:
# plt.figure(figsize=(12, 5))
# plt.subplot(121)
# sources_xr.logpolycoefflux.plot(vmax=50, vmin=-50)
# plt.subplot(122)
# sources_xr.logpolycoefnorm.plot(vmax=50, vmin=-50)pwd
[33]:
# plt.figure(figsize=(15, 8))
# for power in range(5):
# plt.subplot(231 + power)
# np.fabs(sources_xr.logpolycoefflux.loc[dict(power=power)]).plot.hist(
# bins=np.logspace(-0, 4, 20), density=False, lw=3, label="fluxes"
# )
# np.fabs(sources_xr.logpolycoefnorm.loc[dict(power=power)]).plot.hist(
# bins=np.logspace(-0, 4, 20),
# density=False,
# histtype="step",
# lw=2,
# label="normalized fluxes",
# linestyle="--",
# )
# plt.grid()
# plt.title(f"Power {power}")
# plt.legend()
# plt.xscale("log")
# plt.xlabel(None)
[34]:
output_catalog = sources_xr[["logpolycoefflux","logpolycoefpolflux"]]
[35]:
#no need to override this again
#output_catalog["index"] = all_indices
[36]:
output_catalog.logpolycoefflux.attrs["units"] = "Jy"
output_catalog.logpolycoefpolflux.attrs["units"] = "Jy"
[37]:
for coord in ["theta", "phi"]:
output_catalog = output_catalog.assign_coords(**{coord:(("index"), cat[coord][high_flux_sources_mask].astype(np.float64))})
[38]:
output_catalog
[38]:
<xarray.Dataset> Size: 39MB
Dimensions: (index: 372255, power: 5)
Coordinates:
* index (index) int64 3MB 27897657 242175600 ... 166742332
* power (power) int64 40B 4 3 2 1 0
theta (index) float64 3MB 2.655 2.659 2.596 ... 1.377 1.426
phi (index) float64 3MB 4.177 3.977 4.17 ... 0.4638 0.5054
Data variables:
logpolycoefflux (index, power) float64 15MB 0.5214 -12.34 ... 0.08684
logpolycoefpolflux (index, power) float64 15MB -1.76 36.18 ... 0.02163[39]:
output_filename = "data/websky_high_flux_catalog_1mJy.h5"
[40]:
!ls -l $output_filename
ls: cannot access 'data/websky_high_flux_catalog_1mJy.h5': No such file or directory
[41]:
output_catalog.coords["theta"].attrs["units"] = "rad"
output_catalog.coords["phi"].attrs["units"] = "rad"
output_catalog.coords["theta"].attrs["reference_frame"] = "Galactic"
output_catalog.coords["phi"].attrs["reference_frame"] = "Galactic"
[42]:
output_catalog.attrs["description"] = (
"Websky catalog of sources with flux > 1 mJy at 100 GHz, fitted with a 4th order polynomial in log frequency. "
"Galactic reference frame. Sorted by flux at 100 GHz (descending). "
"The 'index' coordinate gives the original index in the Websky catalog."
)
[43]:
output_catalog.to_netcdf(output_filename, format="NETCDF4", mode="w") # requires netcdf4 package
[44]:
%ls -lah $output_filename
-rw-rw-r-- 1 azonca azonca 37M May 12 22:41 data/websky_high_flux_catalog_1mJy.h5
[45]:
import xarray
[46]:
xarray.open_dataset(output_filename)
[46]:
<xarray.Dataset> Size: 39MB
Dimensions: (index: 372255, power: 5)
Coordinates:
* index (index) int64 3MB 27897657 242175600 ... 166742332
* power (power) int64 40B 4 3 2 1 0
theta (index) float64 3MB ...
phi (index) float64 3MB ...
Data variables:
logpolycoefflux (index, power) float64 15MB ...
logpolycoefpolflux (index, power) float64 15MB ...
Attributes:
description: Websky catalog of sources with flux > 1 mJy at 100 GHz, fit...[47]:
import h5py
f = h5py.File(output_filename, 'r')
f["logpolycoefflux"]
[47]:
<HDF5 dataset "logpolycoefflux": shape (372255, 5), type "<f8">
[48]:
f["logpolycoefflux"].attrs["units"]
[48]:
b'Jy'