01_data_slicing
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("./data/gapminder.tsv", sep="\t")


데이터 확인하기

In [3]:
print(df.head()) # 앞부분
print("\n{}".format(type(df)))
print("\n{}".format(df.shape)) # shape
print("\n{}".format(df.columns)) # shape
print("\n{}\n".format(df.dtypes)) # columns type
print("{}".format(df.info())) # info
       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

<class 'pandas.core.frame.DataFrame'>

(1704, 6)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None


데이터 추출

In [4]:
## columns 1개 추출
country_df = df["country"]
print(type(country_df))
print(country_df.head())
print("\n==============================\n")
print(country_df.tail())
<class 'pandas.core.series.Series'>
0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

==============================

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object


columns 3개 추출 "country", "continent", "year"

  • type
  • head
  • tail
In [5]:
subset = df[["country", "continent", "year"]]

print(subset.dtypes) ## type
print("\n==============================\n")
print(subset.head()) ## head
print("\n==============================\n")
print(subset.tail()) ## tail
country      object
continent    object
year          int64
dtype: object

==============================

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

==============================

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007


loc 속성으로 데이터 추출

  • 인덱스가 음이 아닌 정수 사용
  • 인덱스 vs 행번호
In [6]:
## 인덱스가 0, 99인 데이터 추출

print(df.loc[0])
print("\n==============================\n")
print(df.loc[99])
country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object
In [7]:
## 마지막 데이터 추출, shape()이용
### shape는 정수로서 실제크기를 말하지만
### pandas의 인덱스는 0부터 시작하기때문에 1을 빼줘야함

last_row = df.shape[0]-1
print(df.loc[last_row])

print("\n==============================\n")
## 마지막 데이터 추출, tail()이용
### n 인자 사용
print(df.tail(n=1))
country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

==============================

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298
In [8]:
### 인덱스가 0, 99, 999인 데이터를 추출
### list사용

print(df.loc[[0, 99, 999]])
         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130


iloc 속성으로 데이터 추출

  • loc는 데이터프레임의 인덱스를 사용하여 데이터를 추출
  • iloc는 행번호를 사용하여 데이터를 추출
In [9]:
print(df.iloc[1])
print("\n==============================\n")
print(df.iloc[99])
country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object
In [10]:
### iloc에 -1은 행 데이터를 추출
print(df.iloc[-1])
print(df.iloc[1710])
country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

IndexErrorTraceback (most recent call last)
<ipython-input-10-8a802f6f3585> in <module>
      1 ### iloc에 -1은 행 데이터를 추출
      2 print(df.iloc[-1])
----> 3 print(df.iloc[1710])

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1476 
   1477             maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478             return self._getitem_axis(maybe_callable, axis=axis)
   1479 
   1480     def _is_scalar_access(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   2100 
   2101             # validate the location
-> 2102             self._validate_integer(key, axis)
   2103 
   2104             return self._get_loc(key, axis=axis)

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_integer(self, key, axis)
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds
In [11]:
print(df.iloc[[0, 99, 999]])
         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130
In [12]:
### : -- 모든행
subset = df.loc[:, ["year", "pop"]]
print(subset.head())

subset = df.iloc[:, [2, 4, -1]]
print(subset.head())
   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460
   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106
In [13]:
small_range = range(5) ## 

subset = df.iloc[:, small_range]
print(subset.head())

print("\n==============================\n")
range2 = list(range(3, 6))
subset = df.iloc[:, range2]
print(subset.head())
       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460

==============================

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106
In [14]:
range3 = list(range(0, 6, 2))
subset = df.iloc[:, range3]
print(subset.head())
print("\n==============================\n")
subset = df.iloc[:, 0:6:2]
print(subset.head())
       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

==============================

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460
In [15]:
### loc, iloc, 데이터셋이 클 수록 loc 속성이 유리
print(df.iloc[[0, 99, 999], [0, 3, 5]])
print(df.loc[[0, 99, 999], ["country", "lifeExp", "gdpPercap"]])
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
In [16]:
from IPython.core.display import display, HTML

display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09

+ Recent posts