Logistic Regression

Logistic regression composes the linear function below with a sigmoid:

y = sigmoid(Ax + b)
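For reference, here is a minimal sketch of the sigmoid itself (plain numpy; the helper name sigmoid is just for illustration):

import numpy as np

def sigmoid(z):
    # squashes any real value into (0, 1), so the output can be read as a probability
    return 1.0 / (1.0 + np.exp(-z))

print(sigmoid(np.array([-2.0, 0.0, 2.0])))
# [0.11920292 0.5        0.88079708]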
Let's load the birth weight data from the web (GitHub). The following code downloads the raw data only when 'birth_weight.csv' does not already exist in the current directory.
import numpy as np
import requests
import os.path

birth_weight_file = 'birth_weight.csv'
if not os.path.exists(birth_weight_file):
    url = 'https://raw.githubusercontent.com/nfmcclure/tensorflow_cookbook/master/01_Introduction/07_Working_with_Data_Sources/birthweight_data/birthweight.dat'
    load_file = requests.get(url)
    data = load_file.text.split('\r\n')
    header = data[0].split('\t')
    # parse each tab-separated row into floats, skipping empty rows and fields
    raw_birth_data = [[float(args) for args in row.split('\t') if len(args) >= 1]
                      for row in data[1:] if len(row) >= 1]
    length = len(raw_birth_data[0])
    birth_data = np.empty([0, length])
    for birth in raw_birth_data:
        birth_data = np.vstack([birth_data, birth])  # stack the parsed rows into an (n, 9) array
print(header)
# ['LOW', 'AGE', 'LWT', 'RACE', 'SMOKE', 'PTL', 'HT', 'UI', 'BWT']
print(birth_data)
# [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
# [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
# [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
# ...
# [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
Now let's write this data out to 'birth_weight.csv' with the following code.
import csv

with open(birth_weight_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(birth_data)
# the with block closes the file, so no explicit f.close() is needed
Now birth_weight.csv has been created in the current directory. Let's load it back in.
import numpy as np

birth_data = []
with open(birth_weight_file, newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    birth_header = next(csv_reader)  # the first row is the header
    birth_data = [[float(args) for args in row] for row in csv_reader]
birth_data = np.array(birth_data)
print(birth_data)
# [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
# [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
# [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
# ...
# [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
The CSV file can also be loaded with pandas.
import pandas as pd
birth_data = []
birth_data_pd = pd.read_csv(birth_weight_file)
birth_data = birth_data_pd.values
print(birth_data)
# [[1.000e+00 2.800e+01 1.130e+02 ... 0.000e+00 1.000e+00 7.090e+02]
# [1.000e+00 2.900e+01 1.300e+02 ... 0.000e+00 1.000e+00 1.021e+03]
# [1.000e+00 3.400e+01 1.870e+02 ... 1.000e+00 0.000e+00 1.135e+03]
# ...
# [0.000e+00 2.400e+01 2.160e+02 ... 0.000e+00 0.000e+00 4.593e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]
# [0.000e+00 4.500e+01 1.230e+02 ... 0.000e+00 0.000e+00 4.990e+03]]
Now that the data loading is done, let's apply Logistic Regression with TensorFlow. Before doing so, let's look at the algorithm itself.

(Figure: the flow of the logistic regression model)

First, extract the feature variables from the dataset.
birth_m_data = birth_data[:, 1:8]  # predictors: AGE, LWT, RACE, SMOKE, PTL, HT, UI (the BWT column is dropped)
print(birth_m_data)
# [[ 28. 113. 1. ... 1. 0. 1.]
# [ 29. 130. 0. ... 0. 0. 1.]
# [ 34. 187. 1. ... 0. 1. 0.]
# ...
# [ 24. 216. 0. ... 0. 0. 0.]
# [ 45. 123. 0. ... 1. 0. 0.]
# [ 45. 123. 0. ... 1. 0. 0.]]
birth_target = birth_data[:, 0]  # target: LOW (1 = low birth weight)
print(birth_target)
# [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
# 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
# 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
scikit-learn's train_test_split could do this in one call (a sketch follows after the split code below), but here we split the data into training and test sets 80%/20% with plain Python.
# Split data into train/test = 80%/20%
data_size = len(birth_m_data)
train_idx = np.random.choice(data_size, round(data_size*0.8), replace=False)
temp_test_idx = set(np.arange(data_size)) - set(train_idx)
test_idx = np.array(list(temp_test_idx))
x_train = birth_m_data[train_idx]
x_test = birth_m_data[test_idx]
y_train = birth_target[train_idx]
y_test = birth_target[test_idx]
print(x_train)
# [[ 20. 105. 1. ... 0. 0. 0.]
# [ 31. 100. 0. ... 0. 0. 1.]
# [ 32. 132. 0. ... 0. 0. 0.]
# ...
# [ 20. 120. 1. ... 0. 0. 0.]
# [ 20. 150. 0. ... 0. 0. 0.]
# [ 16. 110. 1. ... 0. 0. 0.]]
print(x_test)
# (the held-out 20% of rows; with the corrected indexing these values differ from x_train)
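As mentioned above, scikit-learn's train_test_split does the same job in one call; a minimal sketch (random_state is only there to make the shuffle reproducible):

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    birth_m_data, birth_target, test_size=0.2, random_state=99)  # 80%/20% split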
Let's scale the feature columns so that each has a minimum of 0 and a maximum of 1. You can write the scaling yourself with a user-defined function, or use scikit-learn's MinMaxScaler. We will do both.
# user-defined function: scale with the column min/max of the training set
def zero_to_one(m):
    col_max = np.max(x_train, axis=0)  # the statistics come from x_train on purpose,
    col_min = np.min(x_train, axis=0)  # so train and test are scaled consistently
    return (m - col_min) / (col_max - col_min)

x_train_scaled_def = np.nan_to_num(zero_to_one(x_train))  # nan_to_num guards against 0/0 for constant columns
print(x_train_scaled_def)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
x_test_scaled_def = np.nan_to_num(zero_to_one(x_test))
print(x_test_scaled_def)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
# scikit-learn
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
print(x_train_scaled)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
x_test_scaled = scaler.transform(x_test)
print(x_test_scaled)
# [[0.19354839 0.14705882 1. ... 0. 0. 0. ]
# [0.5483871 0.11764706 0. ... 0. 0. 1. ]
# [0.58064516 0.30588235 0. ... 0. 0. 0. ]
# ...
# [0.19354839 0.23529412 1. ... 0. 0. 0. ]
# [0.19354839 0.41176471 0. ... 0. 0. 0. ]
# [0.06451613 0.17647059 1. ... 0. 0. 0. ]]
Data loading and preprocessing are done, so let's fit the logistic regression with TensorFlow. First we build the computation graph for the learning algorithm.
import tensorflow as tf
from tensorflow.python.framework import ops
import matplotlib.pyplot as plt
ops.reset_default_graph()
# Create graph
sess = tf.Session()
# Set for reproducible results
np.random.seed(seed=99)
tf.set_random_seed(seed=99)
# Initialize placeholders
x_data = tf.placeholder(shape=[None, 7], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)
# Create variables for linear regression
A = tf.Variable(tf.random_normal(shape=[7,1]))
b = tf.Variable(tf.random_normal(shape=[1,1]))
# Declare model operations
formula = tf.add(tf.matmul(x_data, A), b)
# Declare loss function (sigmoid cross-entropy on the logits)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=formula, labels=y_target))
# Declare optimizer
opt = tf.train.GradientDescentOptimizer(0.01)
train_step = opt.minimize(loss)
# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)
# Actual prediction: sigmoid gives a probability, and rounding thresholds it at 0.5
prediction = tf.round(tf.sigmoid(formula))
predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)  # tf.cast converts dtypes: tf.cast(input tensor, dtype)
accuracy = tf.reduce_mean(predictions_correct)
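For reference, sigmoid_cross_entropy_with_logits computes, for a logit z and label y, the loss -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))], evaluated in the numerically stable form max(z, 0) - z*y + log(1 + exp(-|z|)). A quick numpy sanity check (z and y here are just illustrative scalars):

z, y = 1.5, 1.0  # an illustrative logit and label
p = 1.0 / (1.0 + np.exp(-z))  # sigmoid(z)
naive = -(y * np.log(p) + (1 - y) * np.log(1 - p))
stable = max(z, 0) - z * y + np.log(1 + np.exp(-abs(z)))
print(naive, stable)  # both are about 0.2014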
Now that the graph is built, let's train the model.
# Training loop
batch_size = 25
loss_vec = []
train_acc = []
test_acc = []
for i in range(1500):
    rand_idx = np.random.choice(len(x_train_scaled), size=batch_size)
    rand_x = x_train_scaled[rand_idx]
    rand_y = y_train[rand_idx].reshape(-1, 1)  # note the reshape to a column vector
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
    temp_loss = sess.run(loss, feed_dict={x_data: rand_x, y_target: rand_y})  # evaluate the batch loss
    loss_vec.append(temp_loss)
    temp_train = sess.run(accuracy, feed_dict={x_data: x_train_scaled, y_target: y_train.reshape(-1, 1)})  # training accuracy (reshape again)
    train_acc.append(temp_train)
    temp_test = sess.run(accuracy, feed_dict={x_data: x_test_scaled, y_target: y_test.reshape(-1, 1)})  # test accuracy
    test_acc.append(temp_test)
    if (i + 1) % 300 == 0:
        print('Loss = {}'.format(temp_loss))
# Loss: 0.8982473015785217
# Loss: 0.641773521900177
# Loss: 0.630251944065094
# Loss: 0.5026944279670715
# Loss: 0.5662710070610046
# Loss: 0.6175627708435059
# Loss: 0.5372310876846313
sess.close()
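If you also want to look at the fitted coefficients, fetch them before calling sess.close(); a minimal sketch (the names A_hat and b_hat are just illustrative):

A_hat, b_hat = sess.run([A, b])  # run this before closing the session
print(A_hat.ravel())  # one weight per predictor column (AGE ... UI)
print(b_hat)          # intercept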
The following code visualizes the results above.
_, axe = plt.subplots(1, 2)
# Plot loss over time
axe[0].plot(loss_vec, 'k-')
axe[0].set_title('Cross Entropy Loss per Generation')
axe[0].set_xlabel('Generation')
axe[0].set_ylabel('Cross Entropy Loss')
# Plot train and test accuracy
axe[1].plot(train_acc, 'k-', label='Train Set Accuracy')
axe[1].plot(test_acc, 'r--', label='Test Set Accuracy')
axe[1].set_title('Train and Test Accuracy')
axe[1].set_xlabel('Generation')
axe[1].set_ylabel('Accuracy')
axe[1].legend(loc='lower right')
plt.show()
(Figure: cross-entropy loss per generation, and train/test accuracy)
References:
[1] Nick McClure, TensorFlow Machine Learning Cookbook
[2] https://github.com/nfmcclure/tensorflow_cookbook