#!/usr/local/bin/python
# -*- coding: utf-8 -*-
'''3D convolutional neural network trained
to reduce the false positive rate on the LUNA datasets.
The LUNA datasets are stored in a CIFAR-style binary record format.
Author: Kong Haiyang
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import math
import numpy as np
from six.moves import xrange
import tensorflow as tf
import pprocess
import csv
IMAGE_SIZE = 40
PIXELDATA_SIZE = 4
NUM_CHANNELS = 1
NUM_LABELS = 2
SEED = 66478
BATCH_SIZE = 128
NUM_EPOCHS = 10
EVAL_BATCH_SIZE = 64
XAVIER_INIT = tf.contrib.layers.xavier_initializer(seed=SEED)
class lunaDataSet:
  data_filename = ''
  csv_filename = ''
  all_count = test_count = val_count = train_count = 0
  val_no_aug_pos = []
  test_no_aug_pos = []
  train_batch_index = [0]
  is_parallel = False
  nproc = 1
  __fptr = None
  __csv_lines = 0
  __columns, __rows, __heights, __byte_length = IMAGE_SIZE, IMAGE_SIZE, IMAGE_SIZE, PIXELDATA_SIZE
  __t_length = 1 + __columns * __rows * __heights * __byte_length
  __num_of_read = __train_rec_index = __test_rec_index = __val_rec_index = 0
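  # On-disk record layout (CIFAR-style, as implied by the reads below): one
  # uint8 label byte followed by a 40x40x40 float32 volume, so each record is
  #   1 + 40 * 40 * 40 * 4 = 256001 bytes (__t_length).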
  def __init__(self, datafile, csvfile, is_parallel=True):
    self.data_filename = datafile
    self.csv_filename = csvfile
    self.__fptr = open(self.data_filename, 'rb')
    self.__csv_lines = readCSV(self.csv_filename)[1:]  # skip the header row
    self.is_parallel = is_parallel
    if self.is_parallel:
      self.nproc = pprocess.get_number_of_cores() - 1
  def __del__(self):
    self.__fptr.close()
  def getDataNum(self, testNo, valNo, train_group_size=0):
    self.test_count = self.val_count = self.train_count = 0
    self.val_no_aug_pos = []
    self.test_no_aug_pos = []
    for line in self.__csv_lines:
      if line[1] == testNo:
        if line[-2] == '1' and line[-1] == '0':
          self.test_no_aug_pos.append(self.test_count)
        self.test_count += 1
      elif line[1] == valNo:
        if line[-2] == '1' and line[-1] == '0':
          self.val_no_aug_pos.append(self.val_count)
        self.val_count += 1
      else:
        self.train_count += 1
        if train_group_size != 0 and self.train_count % train_group_size == 0:
          self.train_batch_index.append(
              self.test_count + self.val_count + self.train_count)
    self.all_count = self.test_count + self.val_count + self.train_count
    if train_group_size != 0 and self.train_count % train_group_size != 0:
      self.train_batch_index.append(self.all_count)
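  # Split semantics: CSV column 1 holds the subset number; rows matching
  # testNo form the test set, rows matching valNo the validation set, and
  # everything else the training set. The check line[-2] == '1' and
  # line[-1] == '0' presumably flags positives that were not produced by
  # augmentation (an assumption -- the CSV schema is not documented here);
  # their positions are kept so error rates can also be reported on original
  # positives only.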
  def __getValData(self, testNo, valNo, train_group_size=0):
    tempfp = self.__fptr.tell()  # remember the train reader's position
    if 0 == self.all_count:
      self.getDataNum(testNo, valNo, train_group_size)
    val_data = np.empty([self.val_count, self.__rows, self.__columns,
                         self.__heights, 1], dtype=float)
    val_labels = np.empty(self.val_count, dtype=int)
    vali = 0
    self.__fptr.seek(0)
    relative_offset = 0
    for i in range(self.all_count):
      line = self.__csv_lines[i]
      if line[1] == valNo:
        if relative_offset > 0:
          self.__fptr.seek(relative_offset * self.__t_length, 1)
        bufCIFAR = self.__fptr.read(self.__t_length)
        val_labels[vali] = np.frombuffer(
            bufCIFAR[0], dtype=np.uint8).astype(np.int64)
        val_data[vali, ...] = np.frombuffer(
            bufCIFAR[1:], dtype=np.float32).reshape(
                self.__rows, self.__columns, self.__heights, 1)
        vali += 1
        relative_offset = 0
      else:
        relative_offset += 1
    # restore the position so interleaved train reads continue correctly
    # (on 32-bit builds, seek offsets beyond 2 GB may overflow)
    self.__fptr.seek(tempfp)
    return val_labels, val_data
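  # Read strategy used above: instead of reading every record, the loop counts
  # consecutive non-matching records in relative_offset and skips them with a
  # single relative seek (whence=1) before reading the next wanted record.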
  def getValData(self, testNo, valNo, train_group_size=0):
    return self.__getValData(testNo, valNo, train_group_size)
  def __getTestData(self, testNo, valNo, train_group_size=0):
    tempfp = self.__fptr.tell()  # remember the train reader's position
    if 0 == self.all_count:
      self.getDataNum(testNo, valNo, train_group_size)
    test_data = np.empty([self.test_count, self.__rows, self.__columns,
                          self.__heights, 1], dtype=float)
    test_labels = np.empty(self.test_count, dtype=int)
    testi = 0
    self.__fptr.seek(0)
    relative_offset = 0
    for i in range(self.all_count):
      line = self.__csv_lines[i]
      if line[1] == testNo:
        if relative_offset > 0:
          self.__fptr.seek(relative_offset * self.__t_length, 1)
        bufCIFAR = self.__fptr.read(self.__t_length)
        test_labels[testi] = np.frombuffer(
            bufCIFAR[0], dtype=np.uint8).astype(np.int64)
        test_data[testi, ...] = np.frombuffer(
            bufCIFAR[1:], dtype=np.float32).reshape(
                self.__rows, self.__columns, self.__heights, 1)
        testi += 1
        relative_offset = 0
      else:
        relative_offset += 1
    # restore the position so interleaved train reads continue correctly
    # (on 32-bit builds, seek offsets beyond 2 GB may overflow)
    self.__fptr.seek(tempfp)
    return test_labels, test_data
  def getTestData(self, testNo, valNo, train_group_size=0):
    return self.__getTestData(testNo, valNo, train_group_size)
  def __getTrainData(self, testNo, valNo, num=[0]):
    if self.all_count == 0:
      self.getDataNum(testNo, valNo)
    if self.__num_of_read == self.all_count:
      return None, None
    if self.__train_rec_index == 0:
      self.__fptr.seek(0)
    if num[0] == 0 or self.__train_rec_index + num[0] > self.train_count:
      num[0] = self.train_count - self.__train_rec_index
    if num[0] == 0:  # no more train data
      train_data = None
      train_labels = None
    else:
      train_data = np.empty([num[0], self.__rows, self.__columns,
                             self.__heights, 1], dtype=float)
      train_labels = np.empty(num[0], dtype=int)
    traini = 0
    relative_offset = 0
    for i in range(self.all_count - self.__num_of_read):
      line = self.__csv_lines[self.__num_of_read + i]
      if line[1] == testNo:
        self.__test_rec_index += 1
        # skip test records as well; the original reset relative_offset to 0
        # here, which would misalign the file pointer
        relative_offset += 1
      elif line[1] == valNo:
        self.__val_rec_index += 1
        relative_offset += 1
      else:
        if traini == num[0]:
          if relative_offset > 0:
            self.__fptr.seek(relative_offset * self.__t_length, 1)
          break
        if relative_offset > 0:
          self.__fptr.seek(relative_offset * self.__t_length, 1)
        bufCIFAR = self.__fptr.read(self.__t_length)
        train_labels[traini] = np.frombuffer(
            bufCIFAR[0], dtype=np.uint8).astype(np.int64)
        train_data[traini, ...] = np.frombuffer(
            bufCIFAR[1:], dtype=np.float32).reshape(
                self.__rows, self.__columns, self.__heights, 1)
        traini += 1
        self.__train_rec_index += 1
        relative_offset = 0
    self.__num_of_read = self.__train_rec_index + self.__test_rec_index + self.__val_rec_index
    return train_data, train_labels
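  # The train reader is resumable: __num_of_read records how many CSV rows
  # have been consumed, so successive calls return successive groups of
  # training records and (None, None) once the file is exhausted.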
  def getTrainData(self, testNo, valNo, num=[0]):
    return self.__getTrainData(testNo, valNo, num)
Wb = {
    'W1': tf.Variable(tf.truncated_normal([3, 3, 3, NUM_CHANNELS, 16], stddev=0.1, seed=SEED)),
    'b1': tf.Variable(tf.zeros([16])),
    'W2': tf.Variable(tf.truncated_normal([3, 3, 3, 16, 32], stddev=0.1, seed=SEED)),
    'b2': tf.Variable(tf.zeros([32])),
    'W3': tf.Variable(tf.truncated_normal([3, 3, 3, 32, 48], stddev=0.1, seed=SEED)),
    'b3': tf.Variable(tf.zeros([48])),
    'W4': tf.Variable(tf.truncated_normal([3, 3, 3, 48, 64], stddev=0.1, seed=SEED)),
    'b4': tf.Variable(tf.zeros([64])),
    'fcw1': tf.Variable(tf.truncated_normal([2**3 * 64, 32], stddev=0.1, seed=SEED)),
    'fcb1': tf.Variable(tf.zeros([32])),
    'fcw2': tf.Variable(tf.truncated_normal([32, NUM_LABELS], stddev=0.1, seed=SEED)),
    'fcb2': tf.Variable(tf.zeros([NUM_LABELS]))
}
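# The first fully connected layer expects 2**3 * 64 = 512 inputs: the four
# VALID 2x2x2 max-pools in model() shrink the 40-voxel cube per side as
# 40 -> 20 -> 10 -> 5 -> 2, leaving a 2x2x2 volume with 64 channels.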
def model(data, keep_prob):
  with tf.variable_scope('conv1'):
    conv = tf.nn.conv3d(data, Wb['W1'], strides=[1, 1, 1, 1, 1], padding='SAME')
    relu = tf.nn.relu(tf.nn.bias_add(conv, Wb['b1']))
    pool = tf.nn.max_pool3d(relu, ksize=[1, 2, 2, 2, 1],
                            strides=[1, 2, 2, 2, 1], padding='VALID')
  with tf.variable_scope('conv2'):
    conv = tf.nn.conv3d(pool, Wb['W2'], strides=[1, 1, 1, 1, 1], padding='SAME')
    relu = tf.nn.relu(tf.nn.bias_add(conv, Wb['b2']))
    pool = tf.nn.max_pool3d(relu, ksize=[1, 2, 2, 2, 1],
                            strides=[1, 2, 2, 2, 1], padding='VALID')
  with tf.variable_scope('conv3'):
    conv = tf.nn.conv3d(pool, Wb['W3'], strides=[1, 1, 1, 1, 1], padding='SAME')
    relu = tf.nn.relu(tf.nn.bias_add(conv, Wb['b3']))
    pool = tf.nn.max_pool3d(relu, ksize=[1, 2, 2, 2, 1],
                            strides=[1, 2, 2, 2, 1], padding='VALID')
  with tf.variable_scope('conv4'):
    conv = tf.nn.conv3d(pool, Wb['W4'], strides=[1, 1, 1, 1, 1], padding='SAME')
    relu = tf.nn.relu(tf.nn.bias_add(conv, Wb['b4']))
    pool = tf.nn.max_pool3d(relu, ksize=[1, 2, 2, 2, 1],
                            strides=[1, 2, 2, 2, 1], padding='VALID')
  with tf.variable_scope('reshape'):
    ps = pool.get_shape().as_list()
    reshape = tf.reshape(pool, [-1, ps[1] * ps[2] * ps[3] * ps[4]])
  with tf.variable_scope('fc1'):
    hidden = tf.nn.relu(tf.matmul(reshape, Wb['fcw1']) + Wb['fcb1'])
  with tf.variable_scope('dropout'):
    hidden = tf.nn.dropout(hidden, keep_prob, seed=SEED)
  with tf.variable_scope('fc2'):
    out = tf.matmul(hidden, Wb['fcw2']) + Wb['fcb2']
  return out
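# A minimal smoke test of the graph (a sketch; assumes the TF 0.x-era APIs
# used throughout this file):
#   x = tf.placeholder(tf.float32, [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE,
#                                   IMAGE_SIZE, NUM_CHANNELS])
#   logits = model(x, 1.0)  # -> Tensor of shape (BATCH_SIZE, NUM_LABELS)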
def eval_in_batches(data, sess, eval_prediction, eval_data):
  """Get predictions for a dataset by running it in small batches."""
  size = data.shape[0]
  if size < EVAL_BATCH_SIZE:
    raise ValueError("batch size for evals larger than dataset: %d" % size)
  predictions = np.ndarray(shape=(size, NUM_LABELS), dtype=np.float32)
  for begin in xrange(0, size, EVAL_BATCH_SIZE):
    end = begin + EVAL_BATCH_SIZE
    if end <= size:
      predictions[begin:end, :] = sess.run(eval_prediction, feed_dict={
          eval_data: data[begin:end, ...]})
    else:
      batch_predictions = sess.run(eval_prediction, feed_dict={
          eval_data: data[-EVAL_BATCH_SIZE:, ...]})
      predictions[begin:, :] = batch_predictions[begin - size:, :]
  return predictions
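# Tail handling above: when size is not a multiple of EVAL_BATCH_SIZE, the
# final partial batch is evaluated by re-running the last EVAL_BATCH_SIZE rows
# and keeping only the previously unseen tail (batch_predictions[begin - size:]).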
def error_rate(predictions, labels):
  """Return the error rate based on dense predictions and sparse labels."""
  return 100.0 - (100.0 * np.sum(np.argmax(predictions, 1) == labels) /
                  predictions.shape[0])
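# Example: predictions = [[0.9, 0.1], [0.2, 0.8]] with labels = [0, 1] gives
# both argmaxes correct, i.e. 100.0 - 100.0 * 2 / 2 = 0.0% error.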
def readCSV(filename):
  '''Read all lines from a csv file.'''
  lines = []
  with open(filename, "rb") as f:
    csvreader = csv.reader(f)
    for line in csvreader:
      lines.append(line)
  return lines
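# Usage sketch (file name taken from main() below):
#   lines = readCSV('Shuffle.csv')  # lines[0] is the header row
#   lines[1][1]                     # subset number of the first record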
def lunaTrain(VIEW_DIRECTORY, imgName, csvName):
  for cross in range(2):
    cross_start_time = time.time()
    print('Cross {}...'.format(cross))
    WORK_DIRECTORY = os.path.join(VIEW_DIRECTORY, 'Cross{}'.format(cross))
    testNo = str(cross)
    valNo = str((cross + 1) % 10)
    st = time.time()
    view_dataset = lunaDataSet(imgName, csvName, is_parallel=False)
    num_group_batches = 300
    train_group_size = [num_group_batches * BATCH_SIZE]
    val_labels, val_data = view_dataset.getValData(testNo, valNo, train_group_size[0])
    test_labels, test_data = view_dataset.getTestData(testNo, valNo, train_group_size[0])
    print('Reading validation and test data took {:.2f} seconds...'.format(
        time.time() - st))
    num_epochs = NUM_EPOCHS
    train_size = view_dataset.train_count
    group_count = int(math.ceil(float(train_size) / train_group_size[0]))
    train_data_node = tf.placeholder(tf.float32, shape=(
        BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
    eval_data = tf.placeholder(tf.float32, shape=(
        EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
    keep_hidden = tf.placeholder(tf.float32)
    logits = model(train_data_node, keep_hidden)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits, train_labels_node))
    batch = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.01, batch * BATCH_SIZE, train_size / 10,
                                               0.95, staircase=True)
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, 0.9).minimize(loss, global_step=batch)
    train_prediction = tf.nn.softmax(logits)
    eval_prediction = tf.nn.softmax(model(eval_data, 1.0))
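    # Learning rate schedule: exponential_decay is driven by batch * BATCH_SIZE
    # (examples seen) with decay_steps = train_size / 10, so the rate drops by
    # a factor of 0.95 ten times per epoch (staircase), starting from 0.01.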
    start_time = time.time()
    saver = tf.train.Saver()
    with tf.Session() as sess:
      tf.initialize_all_variables().run()
      print('Initialized!')
      st = time.time()
      summary_writer = tf.train.SummaryWriter(WORK_DIRECTORY, sess.graph)
      accumulate_count = 0
      TRAIN_FREQUENCY = train_size // BATCH_SIZE // 10
      VAL_FREQUENCY = train_size // BATCH_SIZE
      TEST_FREQUENCY = VAL_FREQUENCY * 2
      for groupstep in xrange(group_count):
        train_data, train_labels = view_dataset.getTrainData(
            testNo, valNo, train_group_size)
        if train_data is None:
          continue
        for step in xrange(int(math.ceil(train_group_size[0] * 1. / BATCH_SIZE) * num_epochs)):
          accumulate_count += 1
          offset = step * BATCH_SIZE % (train_group_size[0] - train_group_size[0] % BATCH_SIZE)
          batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
          if batch_data.shape[0] != BATCH_SIZE:
            continue
          batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
          feed_dict = {train_data_node: batch_data,
                       train_labels_node: batch_labels, keep_hidden: 0.5}
          _, l, lr, predictions = sess.run([optimizer, loss, learning_rate, train_prediction],
                                           feed_dict=feed_dict)
          if TRAIN_FREQUENCY != 0 and accumulate_count % TRAIN_FREQUENCY == 0:
            elapsed_time = time.time() - start_time
            start_time = time.time()
            print('Step %d (epoch %.2f), %.1f ms' %
                  (accumulate_count, float(accumulate_count) * BATCH_SIZE / train_size,
                   1000 * elapsed_time / TRAIN_FREQUENCY))
            print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
            trE = error_rate(predictions, batch_labels)
            print('Minibatch error: %.3f%%' % trE)
          if VAL_FREQUENCY != 0 and accumulate_count % VAL_FREQUENCY == 0:
            valE = error_rate(eval_in_batches(
                val_data, sess, eval_prediction, eval_data), val_labels)
            print('Validation error: %.3f%%' % valE)
            valPE = error_rate(eval_in_batches(
                val_data[view_dataset.val_no_aug_pos, ...], sess,
                eval_prediction, eval_data), val_labels[view_dataset.val_no_aug_pos])
            print('Validation error of non-augmented positives: %.3f%%' % valPE)
          if TEST_FREQUENCY != 0 and accumulate_count % TEST_FREQUENCY == 0:
            test_error = error_rate(eval_in_batches(
                test_data, sess, eval_prediction, eval_data), test_labels)
            print('Test error: %.3f%%' % test_error)
            test_errorP = error_rate(eval_in_batches(
                test_data[view_dataset.test_no_aug_pos, ...], sess,
                eval_prediction, eval_data), test_labels[view_dataset.test_no_aug_pos])
            print('Test error of non-augmented positives: %.3f%%' % test_errorP)
        # save a checkpoint after each group of training data
        checkpoint_path = os.path.join(WORK_DIRECTORY, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=accumulate_count)
      print('Train finished in {:.2f} seconds...'.format(time.time() - st))
      preds = eval_in_batches(test_data, sess, eval_prediction, eval_data)
      test_error = error_rate(preds, test_labels)
      with open(os.path.join(WORK_DIRECTORY, 'result.txt'), 'w') as f:
        np.savetxt(f, np.c_[preds, test_labels])
      print('Test error: %.3f%%' % test_error)
      test_errorP = error_rate(eval_in_batches(
          test_data[view_dataset.test_no_aug_pos, ...], sess,
          eval_prediction, eval_data), test_labels[view_dataset.test_no_aug_pos])
      print('Test error of non-augmented positives: %.3f%%' % test_errorP)
    print('Cross {} took {:.2f} seconds in total...'.format(cross, time.time() - cross_start_time))
    # drop references so the large arrays can be garbage-collected before the
    # next cross-validation round
    train_data = val_data = test_data = None
    train_labels = val_labels = test_labels = None
def main():
  viewPath = '/home/kong/400G/new_detect'
  csvName = '/home/kong/400G/new_detect/Shuffle.csv'
  imgName = '/home/kong/400G/new_detect/shuffle3D.bin'
  lunaTrain(viewPath, imgName, csvName)


if __name__ == '__main__':
  main()