#!/usr/bin/env python3
Logistic Regression
Logistic regression은 아래 1차 함수를 sigmoid 함수와 합성합니다.
y = sigmoid(Ax + b)
birth weight data를 웹상에서 불러오겠습니다.
다음은 웹(github)에서 데이터를 불러오는 과정입니다.
현재 디렉토리에 'birth_weight.csv'가 없을 때 웹상에서 파일을 불러오는 코드는 다음과 같습니다.
import os.path

import numpy as np
import requests
birth_weight_file = 'birth_weight.csv'

# Download and parse the raw table only when no local CSV copy exists yet.
if not os.path.exists(birth_weight_file):
    url = 'https://raw.githubusercontent.com/nfmcclure/tensorflow_cookbook/master/01_Introduction/07_Working_with_Data_Sources/birthweight_data/birthweight.dat'
    load_file = requests.get(url)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    load_file.raise_for_status()

    # splitlines() copes with both CRLF and LF line endings.
    data = load_file.text.splitlines()
    header = data[0].split('\t')

    # One list of floats per non-empty line; empty fields are skipped.
    raw_birth_data = [
        [float(value) for value in row.split('\t') if len(value) >= 1]
        for row in data[1:]
        if len(row) >= 1
    ]

    # Convert the parsed rows in one step. Stacking rows one at a time onto
    # raw_birth_data (instead of the accumulator) appended a duplicate of the
    # last row to the table.
    birth_data = np.array(raw_birth_data)

    print(header)
    # ['LOW', 'AGE', 'LWT', 'RACE', 'SMOKE', 'PTL', 'HT', 'UI', 'BWT']
    print(birth_data)
    # Abridged example output:
    # [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
    #  [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
    #  [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
    #  ...
    #  [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
    #  [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
이제 이 파일을 다음 코드를 이용하여 만들어보겠습니다.
import csv
# Write the CSV only when it is missing. `header` and `birth_data` come from
# the download step, which presumably runs only when the file is absent, so
# writing unconditionally would fail on a re-run.
if not os.path.exists(birth_weight_file):
    # newline='' lets the csv module control line endings itself.
    with open(birth_weight_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(birth_data)
    # No explicit f.close(): the with-statement already closes the file.
이제 birth_weight.csv가 현재 디렉토리에 생성되었습니다.
이 파일을 다시 불러오겠습니다.
import numpy as np
# Read the CSV back: the first row is the header, every following row is
# converted to a list of floats.
birth_data = []
with open(birth_weight_file, newline='') as csvfile:
    rows = csv.reader(csvfile)
    birth_header = next(rows)
    for row in rows:
        birth_data.append([float(cell) for cell in row])
birth_data = np.array(birth_data)
print(birth_data)
# [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
# [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
# [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
# ...
# [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
pandas로도 csv파일을 불러올 수 있습니다.
import pandas as pd
# Load the same CSV with pandas and keep the underlying NumPy array.
birth_data_pd = pd.read_csv(birth_weight_file)
birth_data = birth_data_pd.values
print(birth_data)
# [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
# [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
# [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
# ...
# [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
이제 데이터를 불러오는 과정이 끝났으니 tensorflow로 Logistic Regression을 적용해보겠습니다.
Logistic Regression을 적용하기 전에 algorithm을 살펴보겠습니다.
Logistic Regression의 흐름
우선 데이터셋에서 변수를 추출하겠습니다.
# Columns 1..7 (AGE through UI in the header) are the model inputs;
# column 0 (LOW) is the binary target.
birth_m_data, birth_target = birth_data[:, 1:8], birth_data[:, 0]
print(birth_m_data)
# [[ 28. 113. 1. ... 1. 0. 1.]
# [ 29. 130. 0. ... 0. 0. 1.]
# [ 34. 187. 1. ... 0. 1. 0.]
# ...
# [ 24. 216. 0. ... 0. 0. 0.]
# [ 45. 123. 0. ... 1. 0. 0.]
# [ 45. 123. 0. ... 1. 0. 0.]]
print(birth_target)
# [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
# 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
# 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
scikit-learn의 train_test_split 함수를 사용해도 되지만, 여기서는 파이썬 코드를 직접 작성하여 훈련 데이터와 테스트 데이터를 8:2로 분할하겠습니다.
# Split data into train/test = 80%/20%
data_size = len(birth_m_data)
# Draw 80% of the row indices without replacement for training.
train_idx = np.random.choice(data_size, round(data_size * 0.8), replace=False)
# Every index that was not drawn for training belongs to the test set.
test_idx = np.setdiff1d(np.arange(data_size), train_idx)

# Features and labels must be indexed with the SAME index array so that each
# row stays paired with its own label, and the test set must use test_idx so
# that it is disjoint from the training set.
x_train = birth_m_data[train_idx]
x_test = birth_m_data[test_idx]
y_train = birth_target[train_idx]
y_test = birth_target[test_idx]

print(x_train)
# Example output (rows vary from run to run because the draw is unseeded):
# [[ 20. 105. 1. ... 0. 0. 0.]
# [ 31. 100. 0. ... 0. 0. 1.]
# [ 32. 132. 0. ... 0. 0. 0.]
# ...
# [ 20. 120. 1. ... 0. 0. 0.]
# [ 20. 150. 0. ... 0. 0. 0.]
# [ 16. 110. 1. ... 0. 0. 0.]]
print(x_test)
# Prints the held-out 20% of rows; none of them appear in x_train.
데이터셋 변수를 최솟값 0, 최댓값 1이 되도록 스케일링 해보겠습니다. 사용자 정의 함수를 이용하여 직접 코드를 짤 수도 있고,
scikit-learn의 MinMaxScaler 메소드로도 할 수 있습니다.
2가지 모두 해보겠습니다.
# user define function
def zero_to_one(m, ref=None):
    """Min-max scale the columns of ``m`` using the column range of ``ref``.

    Args:
        m: Array to scale; columns are scaled independently.
        ref: Array whose per-column minimum and maximum define the scaling.
            Defaults to the module-level ``x_train`` so that train and test
            data are scaled with the same training statistics.

    Returns:
        ``(m - col_min) / (col_max - col_min)`` computed per column. Columns
        that are constant in ``ref`` are divided by 1 instead of 0, so the
        result never contains NaN or inf from a zero range.
    """
    if ref is None:
        ref = x_train
    col_min = np.min(ref, axis=0)
    col_range = np.max(ref, axis=0) - col_min
    # A constant column has range 0; substitute 1 to avoid dividing by zero.
    col_range = np.where(col_range == 0, 1.0, col_range)
    return (m - col_min) / col_range
# Scale the training features with the hand-written scaler, then let
# np.nan_to_num replace any NaN/inf the scaler may have produced.
x_train_scaled_def = zero_to_one(x_train)
x_train_scaled_def = np.nan_to_num(x_train_scaled_def)
print(x_train_scaled_def)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
# Same treatment for the test features.
x_test_scaled_def = zero_to_one(x_test)
x_test_scaled_def = np.nan_to_num(x_test_scaled_def)
print(x_test_scaled_def)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
# scikit-learn
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
print(x_train_scaled)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
x_test_scaled = scaler.transform(x_test)
print(x_test_scaled)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
이제 데이터로딩과 전처리가 완료 되었으니 tensorflow로 logistic 분석을 해보겠습니다.
먼저 tensorflow로 학습 algorithm tool을 만듭니다.
import tensorflow as tf
from tensorflow.python.framework import ops
import matplotlib.pyplot as plt
# Build the TensorFlow 1.x graph for logistic regression: inputs, parameters,
# loss, optimizer step and an accuracy metric.
# Clear the default graph before adding the model's ops.
ops.reset_default_graph()
# Create the session used to run the graph
sess = tf.Session()
# Set for reproducible results.
# NOTE: these seeds only affect random draws made from here on (variable
# initialisation, mini-batch sampling); the train/test split earlier in the
# file has already been drawn at this point.
np.random.seed(seed=99)
tf.set_random_seed(seed=99)
# Initialize placeholders: 7 input features per row, 1 target value per row;
# the leading None leaves the number of rows open.
x_data = tf.placeholder(shape=[None, 7], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)
# Create the model parameters: weight matrix A (7x1) and bias b (1x1),
# both initialised from a normal distribution.
A = tf.Variable(tf.random_normal(shape=[7,1]))
b = tf.Variable(tf.random_normal(shape=[1,1]))
# Declare model operations: the linear part x_data @ A + b (the raw logit;
# no sigmoid is applied here).
fomula = tf.add(tf.matmul(x_data, A), b)
# Declare loss function (cross-entropy loss). The op takes the raw logits,
# and the per-example losses are averaged.
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=fomula, labels=y_target))
# Declare optimizer: gradient descent with learning rate 0.01
opt = tf.train.GradientDescentOptimizer(0.01)
train_step = opt.minimize(loss)
# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)
# Actual prediction: sigmoid of the logit rounded to 0 or 1
prediction = tf.round(tf.sigmoid(fomula))
# tf.cast converts the dtype: tf.cast(input tensor, dtype). Here the boolean
# "prediction equals target" tensor becomes float32 so it can be averaged.
predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
# Accuracy = fraction of rows whose prediction matches the target
accuracy = tf.reduce_mean(predictions_correct)
이제 algorithm tool을 만들었으니 이를 바탕으로 학습을 시켜보겠습니다.
# Training loop: 1500 mini-batch gradient steps, recording the batch loss and
# the train/test accuracy after every step.
batch_size = 25
loss_vec = []
train_acc = []
test_acc = []

# The full-set feeds are the same on every iteration, so build them once.
# Targets are reshaped to a column to match the [None, 1] placeholder.
train_feed = {x_data: x_train_scaled, y_target: y_train.reshape(-1, 1)}
test_feed = {x_data: x_test_scaled, y_target: y_test.reshape(-1, 1)}

for i in range(1500):
    # Sample a mini-batch of row indices from the training set.
    rand_idx = np.random.choice(len(x_train_scaled), size=batch_size)
    batch_feed = {
        x_data: x_train_scaled[rand_idx],
        y_target: y_train[rand_idx].reshape(-1, 1),
    }

    # One optimizer step, then the loss on the same batch after the update.
    sess.run(train_step, feed_dict=batch_feed)
    temp_loss = sess.run(loss, feed_dict=batch_feed)
    loss_vec.append(temp_loss)

    # Accuracy on the whole training set and the whole test set.
    train_acc.append(sess.run(accuracy, feed_dict=train_feed))
    test_acc.append(sess.run(accuracy, feed_dict=test_feed))

    # Report progress every 300 iterations.
    if (i + 1) % 300 == 0:
        print('Loss = {}'.format(temp_loss))
# Sample values from the original write-up; numbers differ from run to run.
# Loss: 0.8982473015785217
# Loss: 0.641773521900177
# Loss: 0.630251944065094
# Loss: 0.5026944279670715
# Loss: 0.5662710070610046
# Loss: 0.6175627708435059
# Loss: 0.5372310876846313
sess.close()
위의 결과를 바탕으로 시각화를 하는 코드는 다음과 같습니다.
# Two side-by-side panels: loss on the left, accuracy on the right.
_, axe = plt.subplots(1, 2)
loss_ax, acc_ax = axe

# Plot loss over time
loss_ax.plot(loss_vec, 'k-')
loss_ax.set_title('Cross Entropy Loss per Generation')
loss_ax.set_xlabel('Generation')
loss_ax.set_ylabel('Cross Entropy Loss')

# Plot train and test accuracy on the same axes
accuracy_series = (
    (train_acc, 'k-', 'Train Set Accuracy'),
    (test_acc, 'r--', 'Test Set Accuracy'),
)
for series, line_fmt, series_label in accuracy_series:
    acc_ax.plot(series, line_fmt, label=series_label)
acc_ax.set_title('Train and Test Accuracy')
acc_ax.set_xlabel('Generation')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

plt.show()
Loss 함수와 Accuracy
참고 자료:
[1]TensorFlow Machine Learning Cookbook, Nick McClure
[2]https://github.com/nfmcclure/tensorflow_cookbook