"""
The SCIMES core package
"""
import numpy as np
import random
from itertools import combinations, cycle
from matplotlib import pyplot as plt
from astropy.io import fits
from astropy.table import Column
from sklearn import metrics
from .spectral import spectral_clustering
def mat_smooth(Mat, scalpar = 0, lscal = False):
"""
Estimate the scaling parameter and rescale
the affinity matrix through a Gaussian kernel.
Parameters
-----------
Mat: numpy array
The affinity matrix to be rescaled
scalpar: float
User-defined scaling parameter
lscal: boll
Rescale the matrix using a local
scaling approach
Return
-------
NM: numpy array
Rescaled affinity matrix
sigmas: float
The estimated scaling parameter
"""
# Using estimated global scaling
if scalpar == 0 and lscal == False:
Aff = np.unique(Mat.ravel())[1::]
psigmas = (Aff+np.roll(Aff,-1))/2
psigmas = psigmas[1:-1]
diff = np.roll(Aff,-1)-Aff
diff = diff[1:-1]
sel_diff_ind = np.min(np.argsort(diff)[::-1][0:5])
sigmas = psigmas[sel_diff_ind]**2
print '-- Estimated scaling parameter:', np.sqrt(sigmas)
# Using local scaling
if scalpar == 0 and lscal == True:
print "-- Local scaling"
dr = np.std(Mat, axis=0)
sigmar = np.tile(dr,(Mat.shape[0],1))
sigmas = sigmar*sigmar.T
# Using user-defined scaling parameter
if scalpar != 0:
print "-- User defined scaling parameter:", scalpar
sigmas = scalpar**2
NM = np.exp(-(Mat**2)/sigmas)
NM[range(NM.shape[0]), range(NM.shape[1])] = 0
return NM, sigmas
def aff_matrix(allleavidx, alllevels, dictparents, dictprops):
"""
Generate the affinity matrices
Parameters
-----------
allleavidx: list
List of all leaf indexes within the
dendrogram
alllevels: list
Dendrogram levels of all structures
dictparents: dictionary
Parents and ancestors of all leaves
within the dendrogram
dictprops: dictionary
Properties of all leaf parents and
ancestors within the dendrogram
Return
-------
WAs: numpy array
Volume and luminosity affinity matrices
"""
print "- Creating affinity matrices"
num = len(allleavidx)
WAs = np.zeros((2,num,num))
volumes = dictprops['volumes']
luminosities = dictprops['luminosities']
# Let's save one for loop
combs = list(combinations(xrange(num), 2))
ncombs = len(combs)
# Going through the branch
for i in range(ncombs):
icont = combs[i][0]
jcont = combs[i][1]
i_idx = allleavidx[icont]
imat = allleavidx.index(i_idx)
j_idx = allleavidx[jcont]
jmat = allleavidx.index(j_idx)
ipars = dictparents[str(i_idx)]
jpars = dictparents[str(j_idx)]
# Find shorter list for the comparison
lpars = min(ipars,jpars)
# Finding the common parents
aux_commons = np.asarray(list(set(ipars).intersection(set(jpars))))
plevels = alllevels[aux_commons]
pi_idx = aux_commons[plevels == max(plevels)][0]
# Volume
wij = volumes[pi_idx]
WAs[0,imat,jmat] = wij
WAs[0,jmat,imat] = wij
# Luminosity
wij = luminosities[pi_idx]
WAs[1,imat,jmat] = wij
WAs[1,jmat,imat] = wij
return WAs
def guessk(Mat, thresh = 0.2):
"""
Guess the number of clusters by couting
the connected blocks in the affinity matrix
Parameters
-----------
Mat: numpy array
The rescaled affinity matrix to guess the
number of cluster from
thresh: float
The threshold to mask the matrix and count
the blocks
Return
-------
kguess: int
Number of guessed clusters
"""
M = 1*Mat
M[M < thresh] = 0
M[M > 0] = 1
np.fill_diagonal(M, 1)
guess_clusters = np.zeros(M.shape[0])
for i in range(M.shape[0]):
guess_clusters[i] = sum(M[i,:])
kguess = 1
i = 0
while i < len(guess_clusters)-1:
curr = int(guess_clusters[i])
if curr != 1:
kguess = kguess+1
i = i + curr
"""
np.fill_diagonal(M, 0)
D = np.zeros(M.shape)
for i in range(D.shape[0]):
D[i,i] = sum(M[i,:])
Lap = D - M
eigv = np.abs(np.linalg.eigvals(Lap))
kguess2 = len(np.where(eigv == 0)[0])
"""
return kguess
def clust_cleaning(allleavidx, allclusters, dictpars, dictchilds):
"""
Remove clusters that do not corresponds to
isolated dendrogram branches
Parameters
-----------
allleavidx: list
List of all leaf indexes within the
dendrogram
allclusters: list
List of dendrogram indexes that
correspond to significant objects
(i.e. clusters)
dictpars: dictionary
Parents and ancestors of all leaves
within the dendrogram
dictchilds: dictionary
Children of all branches
within the dendrogram
Return
-------
cores_idx: list
The final cluster dendrogram indexes
"""
cores_idx = []
for cluster in set(allclusters):
# Selecting the cluster
clust_idx = allclusters == cluster
# Leaves and levels in that cluster
clust_leaves_idx = np.asarray(allleavidx)[clust_idx]
all_par_list = []
for cli in clust_leaves_idx:
par_list = dictpars[str(cli)]
par_list = par_list[0:-1] #The lowest, the trunk, is not considered
all_par_list = all_par_list + par_list
all_par_list = list(set(all_par_list))
core_clust_num = []
clust_core_num = []
for i in range(len(all_par_list)):
sel_par = all_par_list[i]
core_leaves_idx = dictchilds[str(sel_par)]
# Leaves in the core but not in the cluster
core_clust = list(set(core_leaves_idx) - set(clust_leaves_idx))
core_clust_num.append(len(core_clust))
# Leaves in the cluster but not in the core
clust_core = list(set(clust_leaves_idx) & set(core_leaves_idx))
clust_core_num.append(len(clust_core))
# The selected core must not contain other leaves than
# those of the cluster, plus it is the one with the highest
# number of cluster leaves contained
core_clust_num = np.asarray(core_clust_num)
core_clust_num0 = np.where(core_clust_num == 0)
if len(core_clust_num0[0]) > 0:
all_par_list = np.asarray(all_par_list)
all_par_list0 = all_par_list[core_clust_num0]
all_par_list0 = np.asarray(all_par_list0)
clust_core_num = np.asarray(clust_core_num)
clust_core_num0 = clust_core_num[core_clust_num0]
clust_core_num0 = np.asarray(clust_core_num0)
max_num = max(clust_core_num0)
cores = all_par_list0[np.where(clust_core_num0 == max_num)]
cores_idx = cores_idx + cores.tolist()
else:
print "Unassignable cluster ", cluster
return cores_idx
def cloudstering(dendrogram, catalog, criteria, user_k, user_ams, user_scalpars, ssingle, locscal, blind):
"""
SCIMES main function. It collects parents/children
of all structrures within the dendrogram, and their
properties. It calls the affinity matrix-related
functions (for creation, rescaling, cluster counting),
and it runs several time the actual spectral clustering
routine by calculating every time the silhouette of the
current configuration. Input parameter are passed by the
SpectralCloudstering class.
Parameters
-----------
dendrogram: 'astrodendro.dendrogram.Dendrogram' instance
The dendrogram to clusterize
catalog: 'astropy.table.table.Table' instance
A catalog containing all properties of the dendrogram
structures. Generally generated with ppv_catalog module
criteria: list
Defaul clustering criteria to use,
0 = volume, 1 = luminosity
user_k: int
The expected number of clusters, if not provided
it will be guessed automatically through the eigenvalues
of the unsmoothed affinity matrix
user_ams: numpy array
User provided affinity matrix. Whether this is not
furnish it is automatically generated through the
volume and/or luminosity criteria
user_scalpars: list of floats
User-provided scaling parameters to smooth the
affinity matrices
locscal: bool
Smooth the affinity matrices using a local
scaling technique
ssingles: bool
Consider the single, isolated leaves as
individual 'clusters'. Useful for low
resolution data where the beam size
corresponds to the size of a Giant
Molecular Cloud
blind: bool
Show the affinity matrices.
Matplotlib required
Return
-------
clusts: list
The dendrogram branch indexes corresponding to the
identified clusters
AMs: numpy array
The affinity matrices calculated by the algorithm
escalpars: list
Estimated scaling parameters for the different
affinity matrixes
silhoutte: float
Silhouette of the best cluster configuration
"""
# Collecting all connectivity information into more handy lists
all_structures_idx = range(len(catalog['radius'].data))
all_levels = []
all_leav_names = []
all_leav_idx = []
all_brc_names = []
all_brc_idx = []
all_parents = []
all_children = []
trunk_brs_idx = []
two_clust_idx = []
mul_leav_idx = []
for structure_idx in all_structures_idx:
s = dendrogram[structure_idx]
all_levels.append(s.level)
# If structure is a leaf find all the parents
if s.is_leaf and s.parent != None:
par = s.parent
all_leav_names.append(str(s.idx))
parents = []
while par != None:
parents.append(par.idx)
par = par.parent
parents.append(len(catalog['radius'].data)) # This is the trunk!
all_parents.append(parents)
# If structure is a brach find all its leaves
if s.is_branch:
all_brc_idx.append(s.idx)
all_brc_names.append(str(s.idx))
children = []
for leaf in s.sorted_leaves():
children.append(leaf.idx)
all_children.append(children)
# Trunk branches
if s.parent == None:
trunk_brs_idx.append(s.idx)
all_leav_idx = all_leav_idx + children
if s.children[0].is_branch or s.children[1].is_branch:
mul_leav_idx = mul_leav_idx + children
else:
two_clust_idx.append(s.idx)
two_clust_idx = np.unique(two_clust_idx).tolist()
dict_parents = dict(zip(all_leav_names,all_parents))
dict_children = dict(zip(all_brc_names,all_children))
all_levels.append(-1)
all_levels = np.asarray(all_levels)
# Retriving needed properties from the catalog
volumes = catalog['volume'].data
luminosities = catalog['luminosity'].data
t_volume = sum(volumes[trunk_brs_idx])
t_luminosity = sum(luminosities[trunk_brs_idx])
volumes = volumes.tolist()
luminosities = luminosities.tolist()
volumes.append(t_volume)
luminosities.append(t_luminosity)
dict_props = {'volumes':volumes, 'luminosities':luminosities}
# Generating affinity matrices if not provided
if user_ams == None:
AMs = aff_matrix(all_leav_idx, all_levels, dict_parents, dict_props)
if blind == False:
# Showing the volume and luminosity affinity matrices
fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(121)
m1 = ax1.imshow(AMs[0,:,:], interpolation = 'nearest')
ax1.set_title('"Volume" affinity matrix', fontsize = 'medium')
ax1.set_xlabel('leaf index')
ax1.set_ylabel('leaf index')
cb1 = plt.colorbar(m1)
ax2 = fig.add_subplot(122)
m2 = ax2.imshow(AMs[1,:,:], interpolation = 'nearest')
ax2.set_xlabel('leaf index')
ax2.set_ylabel('leaf index')
ax2.set_title('"Luminosity" affinity matrix', fontsize = 'medium')
cb2 = plt.colorbar(m2)
else:
AMs = user_ams
# Check if the affinity matrix has more than 2 elements
# otherwise return everything as clusters ("savesingles").
if AMs.shape[1] <= 2:
print '--- Not necessary to cluster. "savesingles" keyword triggered.'
all_leaves = []
for leaf in dendrogram.leaves:
all_leaves.append(leaf.idx)
clusts = all_leaves
return clusts, AMs
# Check whether the affinity matrix scaling parameter
# are provided by the user, if so use them, otherwise
# calculate them
if user_scalpars == None:
scpars = np.zeros(max(criteria)+1)
else:
scpars = user_scalpars
print "- Start spectral clustering"
# Selecting the criteria and merging the matrices
escalpars = []
for cr in criteria:
print "-- Rescaling ", cr, " matrix"
if criteria.index(cr) == 0:
AM, sigma = mat_smooth(AMs[cr,:,:], scalpar = scpars[cr], lscal = locscal)
escalpars.append(sigma)
else:
AMc, sigma = mat_smooth(AMs[cr,:,:], scalpar = scpars[cr], lscal = locscal)
AM = AM*AMc
escalpars.append(sigma)
# Making the reduced affinity matrices
mul_leav_mat = []
for mli in mul_leav_idx:
mul_leav_mat.append(all_leav_idx.index(mli))
mul_leav_mat = np.asarray(mul_leav_mat)
rAM = AM[mul_leav_mat,:]
rAM = rAM[:,mul_leav_mat]
if blind == False:
# Showing the final affinity matrix
plt.matshow(AM)
plt.colorbar()
plt.title('Final Affinity Matrix')
plt.xlabel('leaf index')
plt.ylabel('leaf index')
# Guessing the number of clusters
# if not provided
if user_k == 0:
kg = guessk(rAM)
else:
kg = user_k-len(two_clust_idx)
print '-- Guessed number of clusters =', kg+len(two_clust_idx)
if kg > 1:
# Find the best cluster number
sils = []
min_ks = max(2,kg-15)
max_ks = min(kg+15,rAM.shape[0]-1)
for ks in range(min_ks,max_ks):
try:
all_clusters, evecs = spectral_clustering(rAM, n_clusters=ks, assign_labels = 'kmeans', eigen_solver='arpack')
sil = metrics.silhouette_score(evecs, np.asarray(all_clusters), metric='euclidean')
except np.linalg.LinAlgError:
sil = 0
sils.append(sil)
# Use the best cluster number to generate clusters
best_ks = sils.index(max(sils))+min_ks
print "-- Best cluster number found through SILHOUETTE (", max(sils),")= ", best_ks+len(two_clust_idx)
silhoutte = max(sils)
all_clusters, evecs = spectral_clustering(rAM, n_clusters=best_ks, assign_labels = 'kmeans', eigen_solver='arpack')
else:
print '-- Not necessary to cluster'
all_clusters = np.zeros(len(all_leaves_idx), dtype = np.int32)
clust_branches = clust_cleaning(mul_leav_idx, all_clusters, dict_parents, dict_children)
clusts = clust_branches + two_clust_idx
print "-- Final cluster number (after cleaning)", len(clusts)
# Add to the cluster list the single leaves, if required
if ssingle == True:
all_leaves = []
clust_leaves = []
for leaf in dendrogram.leaves:
all_leaves.append(leaf.idx)
for clust in clusts:
for leaf in dendrogram[clust].sorted_leaves():
clust_leaves.append(leaf.idx)
unclust_leaves = list(set(all_leaves)-set(clust_leaves))
clusts = clusts + unclust_leaves
print "-- Unclustered leaves added. Final cluster number", len(clusts)
return clusts, AMs, escalpars, silhoutte
[docs]class SpectralCloudstering(object):
"""
Apply the spectral clustering to find the best
cloud segmentation out from a dendrogram.
Parameters
-----------
dendrogram: 'astrodendro.dendrogram.Dendrogram' instance
The dendrogram to clusterize
catalog: 'astropy.table.table.Table' instance
A catalog containing all properties of the dendrogram
structures. Generally generated with ppv_catalog module
cl_volume: bool
Clusterize the dendrogram using the 'volume' criterium
cl_luminosity: bool
Clusterize the dendrogram using the 'luminosity' criterium
user_k: int
The expected number of clusters, if not provided
it will be guessed automatically through the eigenvalues
of the unsmoothed affinity matrix
user_ams: numpy array
User provided affinity matrix. Whether this is not
furnish it is automatically generated through the
volume and/or luminosity criteria
user_scalpars: list of floats
User-provided scaling parameters to smooth the
affinity matrices
locscaling: bool
Smooth the affinity matrices using a local
scaling technique
savesingles: bool
Consider the single, isolated leaves as
individual 'clusters'. Useful for low
resolution data where the beam size
corresponds to the size of a Giant
Molecular Cloud.
blind: bool
Show the affinity matrices.
Matplotlib required.
Return
-------
clusters: list
The dendrogram branch indexes corresponding to the
identified clusters
affmats: numpy array
The affinity matrices calculated by the algorithm
escalpars: list
Estimated scaling parameters for the different
affinity matrixes
silhouette: float
Silhouette of the best cluster configuration
"""
def __init__(self, dendrogram, catalog, cl_volume = True, cl_luminosity=True,
user_k = None, user_ams = None, user_scalpars = None,
savesingles = False, locscaling = False, blind = False):
self.dendrogram = dendrogram
self.catalog = catalog
if 'luminosity' not in catalog.colnames:
print("WARNING: adding luminosity = flux to the catalog.")
catalog.add_column(Column(catalog['flux'], 'luminosity'))
if 'volume' not in catalog.colnames:
print("WARNING: adding volume = pi * radius^2 * v_rms to the catalog.")
catalog.add_column(Column(catalog['radius']**2*np.pi *
catalog['v_rms'], 'volume'))
self.cl_volume = cl_volume
self.cl_luminosity = cl_luminosity
self.user_k = user_k or 0
self.user_ams = user_ams
self.user_scalpars = user_scalpars
self.locscaling = locscaling
self.savesingles = savesingles
self.blind = blind
# Clustering criteria chosen
self.criteria = []
if self.cl_volume:
self.criteria.append(0)
if self.cl_luminosity:
self.criteria.append(1)
if self.cl_volume == False:
print("WARNING: clustering will be performed on the Luminosity matrix only")
if self.cl_luminosity == False:
print("WARNING: clustering will be performed on the Volume matrix only")
if self.cl_luminosity and self.cl_volume:
print("WARNING: clustering will be performed on the Aggregated matrix")
# Default colors in case plot_connected_colors is called before showdendro
self.colors = cycle('rgbcmyk')
self.clusters, self.affmats, self.escalpars, self.silhouette = cloudstering(self.dendrogram,
self.catalog,
self.criteria,
self.user_k, self.user_ams, self.user_scalpars,
self.savesingles, self.locscaling, self.blind)
[docs] def showdendro(self):
"""
Show the clustered dendrogram
every color correspond to a
different cluster
"""
dendro = self.dendrogram
cores_idx = self.clusters
# For the random colors
r = lambda: random.randint(0,255)
p = dendro.plotter()
fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(111)
ax.set_yscale('log')
cols = []
# Plot the whole tree
p.plot_tree(ax, color='black')
for i in range(len(cores_idx)):
col = '#%02X%02X%02X' % (r(),r(),r())
cols.append(col)
p.plot_tree(ax, structure=dendro[cores_idx[i]], color=cols[i], lw=3)
ax.set_title("Final clustering configuration")
ax.set_xlabel("Structure")
ax.set_ylabel("Flux")
self.colors = cols
[docs] def plot_connected_clusters(self, **kwargs):
from plotting import dendroplot_clusters
return dendroplot_clusters(self.clusters, self.dendrogram, self.catalog,
colors=self.colors,
**kwargs)
[docs] def asgncube(self, header, collapse = True):
"""
Create a label cube with only the cluster (cloudster) IDs included, and
write to disk.
Parameters
----------
header : `fits.Header`
The header of the output assignment cube. Should be the same
header that the dendrogram was generated from
collapse : bool
Collapsed (2D) version of the assignment cube
Return
-------
asgn = 'astropy.io.fits.PrimaryHDU' instance
Cube of labels
"""
data = self.dendrogram.data.squeeze()
dendro = self.dendrogram
# Making the assignment cube
asgn = np.zeros(data.shape, dtype=np.int32)
for i in self.clusters:
asgn[np.where(dendro[i].get_mask(shape = asgn.shape))] = i+1
# Write the fits file
self.asgn = fits.PrimaryHDU(asgn.astype('short'), header)
# Collapsed version of the asgn cube
if collapse:
asgn_map = np.amax(self.asgn.data, axis = 0)
plt.matshow(asgn_map, origin = "lower")
cbar = plt.colorbar()
cbar.ax.get_yaxis().labelpad = 15
cbar.ax.set_ylabel('Structure label', rotation=270)