In [1]:
import numpy as np
import os
import pandas as pd
import h5py

from astrometry.util.fits import fits_table, merge_tables

# To plot pretty figures
%matplotlib inline
#%matplotlib notebook

# to make this notebook's output stable across runs
def reset_graph(seed=7):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

import tensorflow as tf

%load_ext autoreload
%autoreload 2
/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)
In [30]:
from obiwan.qa.visual import plotImage, readImage, sliceImage

Visualize the training data

read the hdf5 files

In [123]:
f_dr5= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'dr5'),
                    'r')
f_dr5_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'dr5'),
                    'r')
dr5_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'dr5'))

f_sim= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'sim'),
                 'r')

f_sim_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'sim'),
                 'r')
sim_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'sim'))
In [124]:
keys=dict(dr5=[key for key in f_dr5.keys()],
          sim=[key for key in f_sim.keys()])
          #jpeg=list(f_sim_jpeg.keys()))
len(keys['dr5']),len(dr5_sorted_ids),len(keys['sim']),len(sim_sorted_ids)
Out[124]:
(880, 880, 259, 259)
In [125]:
f_sim['/%s/ivar' % keys['sim'][i]].shape
Out[125]:
(64, 64, 3)
In [126]:
print(sim_sorted_ids.get_columns())
for T in sim_sorted_ids[:10]: print(T.id,T.mag_g)
['id', 'mag_g']
103080843 21.3871759879
370074523 21.9118273261
114185542 22.1511480357
106085338 22.2520302338
61604085 22.4234469734
243535310 22.4476237756
156101043 22.516784148
8341765 22.5456832241
114521399 22.5477962344
85407414 22.6195802363
In [130]:
_= plt.hist(sim_sorted_ids.mag_g,histtype='step',color='b')
_= plt.hist(dr5_sorted_ids.mag_g,histtype='step',color='g')
../_images/nb_CNN_8_0.png

For each sim mag_g, draw the nearest dr5 mag_g, without replacements

In [151]:
dr5_mags= dr5_sorted_ids.mag_g.copy()
id_dr5_draw= []
for sim_mag in sim_sorted_ids.mag_g:
    i= np.argmin(np.abs(dr5_mags - sim_mag))
    id_dr5_draw.append(dr5_sorted_ids.id[i])
    len_i= len(dr5_mags)
    dr5_mags= np.delete(dr5_mags,dr5_mags[i])
    assert(len_i == len(dr5_mags)+1)
/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: using a non-integer array as obj in delete will result in an error in the future
  import sys
In [157]:
for i in range(20):
    print(dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i]].mag_g, sim_sorted_ids.mag_g[i])
[ 21.39718819] 21.3871759879
[ 21.90782928] 21.9118273261
[ 22.14355278] 22.1511480357
[ 22.217062] 22.2520302338
[ 22.41222191] 22.4234469734
[ 22.4383316] 22.4476237756
[ 22.48746109] 22.516784148
[ 22.48746109] 22.5456832241
[ 22.48746109] 22.5477962344
[ 22.5747509] 22.6195802363
[ 22.62215424] 22.6675529884
[ 22.62044716] 22.6677327543
[ 22.63945389] 22.694754685
[ 22.64107895] 22.7006423466
[ 22.69321251] 22.7392880101
[ 22.69935799] 22.750211746
[ 22.69935799] 22.7524777029
[ 22.7031498] 22.7607775988
[ 22.71958923] 22.7808052859
[ 22.71958923] 22.7834926551
In [191]:
import matplotlib as mpl
fontsize= 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize']= fontsize
mpl.rcParams['font.size']= fontsize
In [221]:
def fake_real_mosaic(i_start, nrow=8,ncol=7):
    fig,ax= plt.subplots(nrow,ncol,figsize=(ncol*1.5,1.5*nrow))
    plt.subplots_adjust(hspace=0.01,wspace=0.01)
    d_text= dict(rotation='horizontal',
               horizontalalignment='left',
                verticalalignment='center')

    # Real
    i_real=i_start-1
    for row in np.arange(0,nrow,2):
        i_real+=1
        plotImage().imshow(f_dr5_jpeg['/%s/img' % id_dr5_draw[i_real]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_dr5['/%s/img' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_dr5['/%s/ivar' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        ax[row,-1].text(1.05,0.5,'%.2f' % dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i_real]].mag_g,
                        transform=ax[row,-1].transAxes,**d_text)
    # Fake
    i_fake=i_start-1
    for row in np.arange(1,nrow,2):
        i_fake+=1
        plotImage().imshow(f_sim_jpeg['/%s/img' % sim_sorted_ids.id[i_fake]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_sim['/%s/img' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_sim['/%s/ivar' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        ax[row,-1].text(1.05,0.5,'%.2f' % sim_sorted_ids.mag_g[i_fake],
                        transform=ax[row,-1].transAxes,**d_text)


    for row in range(nrow):
        for col in range(ncol):
            ax[row,col].set_xticks([])
            ax[row,col].set_yticks([])

    for col,name in zip(range(ncol),
                        ['g+r+z','g','r','z','ivar(g)','ivar(r)','ivar(z)']):
        tlab= ax[0,col].set_title(name)
    mlab= ax[0,-1].text(1.05,1.1,'mag(g)',
                        transform=ax[0,-1].transAxes,**d_text)
    d_lab= dict(rotation='horizontal',
               horizontalalignment='right',
               verticalalignment='center')
    for row in np.arange(0,nrow,2):
        ylab=ax[row,0].set_ylabel("R",**d_lab)
    for row in np.arange(1,nrow,2):
        ax[row,0].set_ylabel("F",**d_lab)

    plt.savefig('fake_real_mosaic_istart_%d.png' % i_start,
                bbox_extra_artists=[tlab,ylab,mlab], bbox_inches='tight')

Mosaic of training images

In [225]:
# Brightest
fake_real_mosaic(0, nrow=8,ncol=7)
../_images/nb_CNN_15_0.png
In [223]:
# Faintest
fake_real_mosaic(len(sim_sorted_ids)-int(nrow/2)-1, nrow=8,ncol=7)
../_images/nb_CNN_16_0.png
In [224]:
# Median
fake_real_mosaic(len(sim_sorted_ids)//2, nrow=8,ncol=7)
../_images/nb_CNN_17_0.png
In [ ]:
# Loop over all sim sources
for istart in np.arange(0,len(sim_sorted_ids),8):
    fake_real_mosaic(istart, nrow=8,ncol=7)
/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-227-a6e3b923c3b1> in <module>()
      1 for istart in np.arange(0,len(sim_sorted_ids),8):
----> 2     fake_real_mosaic(istart, nrow=8,ncol=7)

<ipython-input-221-ad060ee414cb> in fake_real_mosaic(i_start, nrow, ncol)
     10     for row in np.arange(0,nrow,2):
     11         i_real+=1
---> 12         plotImage().imshow(f_dr5_jpeg['/%s/img' % id_dr5_draw[i_real]],ax[row,0])
     13         for iband,col in enumerate(range(1,4)):
     14             plotImage().imshow(f_dr5['/%s/img' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])

IndexError: list index out of range
../_images/nb_CNN_18_2.png
../_images/nb_CNN_18_3.png
../_images/nb_CNN_18_4.png
../_images/nb_CNN_18_5.png
../_images/nb_CNN_18_6.png
../_images/nb_CNN_18_7.png
../_images/nb_CNN_18_8.png
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
../_images/nb_CNN_18_10.png
../_images/nb_CNN_18_11.png
../_images/nb_CNN_18_12.png
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
../_images/nb_CNN_18_14.png
../_images/nb_CNN_18_15.png
../_images/nb_CNN_18_16.png
../_images/nb_CNN_18_17.png
../_images/nb_CNN_18_18.png
../_images/nb_CNN_18_19.png
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
../_images/nb_CNN_18_21.png
../_images/nb_CNN_18_22.png
../_images/nb_CNN_18_23.png
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
../_images/nb_CNN_18_25.png
../_images/nb_CNN_18_26.png
In [12]:
from glob import glob
fns= glob(os.path.join(os.environ['HOME'],'Downloads',
                'dr5_testtrain/testtrain/121/1211p060/xtrain_*.npy'))
xtrain= [np.load(fn) for fn in fns]
np.vstack(xtrain).shape
Out[12]:
(1408, 64, 64, 6)
In [34]:
jpg= readImage(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/legacysurvey-1211p060-image.jpg'),jpeg=True)
sliceImage(jpg,xslice=slice(10,20),yslice=slice(10,20)).shape
Out[34]:
(10, 10, 3)
In [29]:
os.path.basename('/global/cscratch1/sd/kaylanb/obiwan_out/elg_dr5_coadds/coadd/121/1211p060/rs0')
Out[29]:
'rs0'

Load training data (.npy files created with “split_traintest.py” script)

In [7]:
dr= os.path.join(os.environ['HOME'],'Downloads',
                'dr5_testtrain/testtrain/121/1211p060')
xtrain= np.load(os.path.join(dr,'xtrain_1.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_1.npy'))
xtrain.shape,ytrain.shape,len(ytrain[ytrain == 0]),set(ytrain)

Out[7]:
((512, 64, 64, 6), (512,), 250, {0.0, 1.0})
In [26]:
isFake= ytrain == 1
img_inds= [0,2,4]

nrow,ncol=2,8
for band,iband in zip('grz',img_inds):
    fig,ax= plt.subplots(nrow,ncol,figsize=(10,2))
    i=-1
    for row in range(nrow):
        for col in range(ncol):
            i+=1
            plotImage().imshow(xtrain[isFake][i,:,:,iband],ax[row,col])
    fig.suptitle('%s-band' % band)
../_images/nb_CNN_24_0.png
../_images/nb_CNN_24_1.png
../_images/nb_CNN_24_2.png
In [28]:
img_inds= np.array([0,2,4])+1

nrow,ncol=2,8
for band,iband in zip('grz',img_inds):
    fig,ax= plt.subplots(nrow,ncol,figsize=(10,2))
    i=-1
    for row in range(nrow):
        for col in range(ncol):
            i+=1
            plotImage().imshow(xtrain[isFake][i,:,:,iband],ax[row,col])
    fig.suptitle('ivar-%s' % band)
../_images/nb_CNN_25_0.png
../_images/nb_CNN_25_1.png
../_images/nb_CNN_25_2.png

Build the CNN

In [31]:
xtrain[0,...].shape, xtrain.dtype,ytrain.dtype
Out[31]:
((64, 64, 6), dtype('float32'), dtype('float64'))
In [67]:
# Design:
# input, 3x(conv + avg pool), 2x(fc)

height,width,channels = (64,64,6) #images_real.shape

reset_graph()

conv_kwargs= dict(strides=1,
                  padding='SAME',
                  activation=tf.nn.relu)
pool_kwargs= dict(ksize= [1,2,2,1],
                  strides=[1,2,2,1],
                  padding='VALID')

with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None,height,width,channels], name="X") #training data shape
    #X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(tf.int32, shape=[None], name="y")

init = tf.global_variables_initializer()

# 64x64
with tf.name_scope("layer1"):
    conv1 = tf.layers.conv2d(X, filters=2*channels, kernel_size=7,
                             **conv_kwargs)
    pool1 = tf.nn.avg_pool(conv1, **pool_kwargs)

# 32x32
with tf.name_scope("layer2"):
    conv2 = tf.layers.conv2d(pool1, filters=4*channels, kernel_size=7,
                             **conv_kwargs)
    pool2 = tf.nn.avg_pool(conv2, **pool_kwargs)

# 16x16
with tf.name_scope("layer3"):
    conv3 = tf.layers.conv2d(pool2, filters=8*channels, kernel_size=7,
                             **conv_kwargs)
    pool3 = tf.nn.avg_pool(conv3, **pool_kwargs)
    # next is fc
    pool3_flat = tf.reshape(pool3,
                            shape=[-1, pool3.shape[1] * pool3.shape[2] * pool3.shape[3]])


with tf.name_scope("fc"):
    fc = tf.layers.dense(pool3_flat, 64, activation=tf.nn.relu, name="fc")

with tf.name_scope("output"):
    logits = tf.layers.dense(fc, 2, name="output") # 2 classes
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

loss_summary= tf.summary.scalar('loss', loss)
accur_summary = tf.summary.scalar('accuracy', accuracy)
In [75]:
logits.op.name, loss.op.name, accuracy.op.name
Out[75]:
('output/output/BiasAdd', 'train/Mean', 'eval/Mean')
In [68]:
def BatchGen(X,y,batch_size=32):
    # if not perfect divide, will drop extra training instances
    N= X.shape[0]
    ind= np.array_split(np.arange(N),N // batch_size)
    for i in ind:
        yield X[i,...],y[i].astype(np.int32) #.reshape(-1,1).astype(np.int32)

a=BatchGen(xtrain,ytrain,batch_size=32)
for X_,y_ in a:
    print('batch:',X_.shape,y_.shape)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
In [69]:
from datetime import datetime

def get_logdir(root_logdir):
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    return "{}/run-{}/".format(root_logdir, now)

Layers (check that size of each layer makes sense)

In [70]:
batch_size=32
X_,y_= xtrain[:batch_size,...],ytrain[:batch_size].astype(np.int32)

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(X, feed_dict={X: X_}).shape)
    print(sess.run(conv1, feed_dict={X: X_}).shape)
    print(sess.run(pool1, feed_dict={X: X_}).shape)
    print(sess.run(conv2, feed_dict={X: X_}).shape)
    print(sess.run(pool2, feed_dict={X: X_}).shape)
    print(sess.run(conv3, feed_dict={X: X_}).shape)
    print(sess.run(pool3, feed_dict={X: X_}).shape)
    print(sess.run(pool3_flat, feed_dict={X: X_}).shape)
    print(sess.run(fc, feed_dict={X: X_}).shape)
    print(sess.run(logits, feed_dict={X: X_}).shape)
(32, 64, 64, 6)
(32, 64, 64, 12)
(32, 32, 32, 12)
(32, 32, 32, 24)
(32, 16, 16, 24)
(32, 16, 16, 48)
(32, 8, 8, 48)
(32, 3072)
(32, 64)
(32, 2)

Train (4 epochs)

In [72]:
xtrain= np.load(os.path.join(dr,'xtrain_1.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_1.npy'))
file_writer = tf.summary.FileWriter(get_logdir(os.path.join(os.environ['HOME'],'Downloads',
                                                "cnn",'logs')),
                                    tf.get_default_graph())

n_epochs = 4
batch_size = 32
n_batches= ytrain.shape[0]//batch_size + 1

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        batch_index=0
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
            batch_index+=1
            if i % 1 == 0:
                step = epoch * n_batches + batch_index
                file_writer.add_summary(loss_summary.eval(feed_dict={X: X_, y: y_}),
                                        step)
                file_writer.add_summary(accur_summary.eval(feed_dict={X: X_, y: y_}),
                                        step)

        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                  "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))
0 Train accuracy: 0.46875
1 Train accuracy: 0.78125
2 Train accuracy: 0.75
3 Train accuracy: 0.84375

Restart from checkpoints

In [59]:
xtrain= np.load(os.path.join(dr,'xtrain_2.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_2.npy'))
with tf.Session() as sess:
    # Equiv of sess.run(init)
    saver.restore(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))
    #print('resorted model has accur=',accuracy.eval())
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                  "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))
INFO:tensorflow:Restoring parameters from /Users/kaylan/Downloads/cnn/final.ckpt
0 Train accuracy: 0.90625
1 Train accuracy: 0.90625
2 Train accuracy: 0.90625
3 Train accuracy: 0.90625
In [4]:
f_real= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                    'r') #1165p107
dr5_tractor= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'dr5_hdf5/hdf5/121/1211p060/tractor-1211p060.fits'))


f_fake= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'elg_dr5_coadds/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                    'r')
simcat= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'elg_dr5_coadds/hdf5/121/1211p060',
                    'simcat-elg-1211p060-rsALL.fits'))

print(len(f_real.keys()),len(f_fake.keys()),np.min([len(f_real.keys()),len(f_fake.keys())]))
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-4-45479c8404b2> in <module>()
      1 f_real= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
      2                     'dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5'),
----> 3                     'r') #1165p107
      4 dr5_tractor= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
      5                     'dr5_hdf5/hdf5/121/1211p060/tractor-1211p060.fits'))

~/miniconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, **kwds)
    267             with phil:
    268                 fapl = make_fapl(driver, libver, **kwds)
--> 269                 fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
    270
    271                 if swmr_support:

~/miniconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
     97         if swmr and swmr_support:
     98             flags |= h5f.ACC_SWMR_READ
---> 99         fid = h5f.open(name, flags, fapl=fapl)
    100     elif mode == 'r+':
    101         fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5f.pyx in h5py.h5f.open()

OSError: Unable to open file (unable to open file: name = '/Users/kaylan/DOWNLOADS/dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
In [87]:
def get_data(f,num=128):
    """Returns numpy array (num,64,64,6)"""
    return np.array([np.stack([f[key+'/img'],f[key+'/ivar']],axis=-1).reshape((64,64,6))
                     for key in list(f.keys())[:num]])

def get_data_imgonly(f,num=128):
    """Returns numpy array (num,64,64,3)"""
    return np.array([np.reshape(f[key+'/img'],(64,64,3))
                     for key in list(f.keys())[:num]])

real= get_data_imgonly(f_real,n)
real.shape
Out[87]:
(128, 64, 64, 6)
In [92]:
inHdf5= pd.Series(dr5_tractor.objid).isin(list(f_real.keys()))
print(len(dr5_tractor[inHdf5]),len(list(f_real.keys())))
dr5_tractor.cut(inHdf5)
# flux_gt_0= ((inHdf5) &
#             (dr5_tractor.flux_g > 0) &
#             (dr5_tractor.flux_r > 0) &
#             (dr5_tractor.flux_z > 0))
# dr5_tractor.cut(flux_gt_0)
880 880
In [93]:
d= {b:dr5_tractor.get('flux_'+b)
    for b in 'grz'}
df= pd.DataFrame(d)
df= df.apply(lambda x: x.values.byteswap().newbyteorder())
df.describe()
Out[93]:
g r z
count 880.000000 880.000000 880.000000
mean 0.761698 1.529735 2.634772
std 0.746897 1.434992 2.366555
min 0.145401 0.261415 0.613148
25% 0.282475 0.506985 1.009587
50% 0.447857 0.891814 1.629145
75% 0.962490 2.082288 3.528070
max 4.510773 6.133409 14.447383
In [96]:
def flux2mag(nmgy):
    return -2.5 * (np.log10(nmgy) - 9)

assert('g' not in dr5_tractor.get_columns())
assert('g' not in simcat.get_columns())
for b in 'grz':
    dr5_tractor.set(b, flux2mag(dr5_tractor.get('flux_'+b)/dr5_tractor.get('mw_transmission_'+b)))
    simcat.set(b, flux2mag(simcat.get(b+'flux')))
dr5_tractor.set('gr', dr5_tractor.g - dr5_tractor.r)
dr5_tractor.set('rz', dr5_tractor.r - dr5_tractor.z)
simcat.set('gr', simcat.g - simcat.r)
simcat.set('rz', simcat.r - simcat.z)

Real vs. Fake galaxies in TS box

In [99]:
fig,ax=plt.subplots(1,3,figsize=(12,4))
ax[0].scatter(dr5_tractor.rz,dr5_tractor.gr,alpha=0.3,c='b',label='dr5')
ax[1].scatter(simcat.rz, simcat.gr,alpha=0.3,c='g',label='sim')
ax[2].scatter(simcat.rz, simcat.gr,alpha=0.3,c='g',label='sim')
ax[2].scatter(dr5_tractor.rz,dr5_tractor.gr,alpha=0.3,c='b',label='dr5')
# ax.plot(pad['x1'],pad['y1'],'r--')
# ax.plot(pad['x2'],pad['y2'],c='r',ls='--',lw=2)
# ax.plot(pad['x3'],pad['y3'],c='r',ls='--',lw=2)
# ax.plot(pad['x4'],pad['y4'],c='r',ls='--',lw=2)
for i in range(3):
    ax[i].set_xlabel('r-z')
    ax[i].set_ylabel('g-r')
    ax[i].set_ylim(-0.3,2)
    ax[i].set_xlim(-0.6,2.2)
    ax[i].legend()
../_images/nb_CNN_46_0.png

Real galaxies are brighter

In [97]:
import seaborn as sns
fig,ax= plt.subplots(1,3,figsize=(12,3))
for i,b in enumerate('grz'):
    sns.distplot(dr5_tractor.get(b),kde=False,ax=ax[i],color='b')
    sns.distplot(simcat.get(b),kde=False,ax=ax[i],color='g')
    ax[i].set_xlabel(b + ' mag')

../_images/nb_CNN_48_0.png

apply cuts at bright end

In [101]:
def inRegion(rz,gr):
    return ((rz > 0.5) &
            (rz < 1.) &
            (gr > 0.5) &
            (gr < 1.))

def isFaint(g,r,z):
    return ((g > 23.) &
            (r > 22.5) &
            (z > 21.5))

dr5_keep= ((inRegion(dr5_tractor.rz,dr5_tractor.gr)) &
           (isFaint(dr5_tractor.g,dr5_tractor.r,dr5_tractor.z)))
sim_keep= ((inRegion(simcat.rz, simcat.gr)) &
           (isFaint(simcat.g,simcat.r,simcat.z)))


fig,ax= plt.subplots(1,3,figsize=(12,3))
for i,b in enumerate('grz'):
    sns.distplot(dr5_tractor.get(b)[dr5_keep],kde=False,ax=ax[i],color='b')
    sns.distplot(simcat.get(b)[sim_keep],kde=False,ax=ax[i],color='g')
    ax[i].set_xlabel(b + ' mag')


fig,ax= plt.subplots(1,3,figsize=(8,3))
ax[0].scatter(dr5_tractor.rz[dr5_keep], dr5_tractor.gr[dr5_keep],alpha=0.3,c='b',label='dr5')
ax[1].scatter(simcat.rz[sim_keep], simcat.gr[sim_keep],alpha=0.3,c='g',label='sim')
for i in range(3):
    ax[i].set_xlabel('r-z')
    ax[i].set_ylabel('g-r')
    ax[i].set_ylim(-0.3,2)
    ax[i].set_xlim(-0.6,2.2)
    ax[i].legend()

print(len(dr5_tractor[dr5_keep]), len(simcat[sim_keep]))
/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "
135 544
../_images/nb_CNN_50_2.png
../_images/nb_CNN_50_3.png
In [102]:
from scipy import spatial
dr5_grz= np.array([[g,r,z]
                  for g,r,z in zip(dr5_tractor.g[dr5_keep],
                                   dr5_tractor.r[dr5_keep],
                                   dr5_tractor.z[dr5_keep])])
sim_grz= np.array([[g,r,z]
                  for g,r,z in zip(simcat.g[sim_keep],
                                   simcat.r[sim_keep],
                                   simcat.z[sim_keep])])

print(dr5_grz.shape, sim_grz.shape)
dr5_tree = spatial.KDTree(dr5_grz)
(135, 3) (544, 3)
In [103]:
d,ind=dr5_tree.query(sim_grz,p=1)
print(len(ind))
plt.hist(d)
544
Out[103]:
(array([  19.,   47.,  102.,  133.,  120.,   57.,   47.,   13.,    5.,    1.]),
 array([ 0.00694428,  0.03292467,  0.05890506,  0.08488544,  0.11086583,
         0.13684621,  0.1628266 ,  0.18880698,  0.21478737,  0.24076775,
         0.26674814]),
 <a list of 10 Patch objects>)
../_images/nb_CNN_54_2.png