#변수의 종류를 확인하고 인코딩까지 해보자 

import numpy as np
import pandas as pd #넘파이와 판다스 어쩌구는 시간이 지나면 해결해줄 것이라 믿는다

# Sample data provided by the instructor.
data = dict(
    Name=["John", "Sabre", "Kim", "Sato", "Lee", "Smith", "David", "Park"],
    Country=["USA", "France", "Korea", None, "Korea", "UK", "USA", "Korea"],
    # Numerical in meaning, but the mix of str / int / None / NaN values
    # means it may be recognized as categorical.
    Age=["31", 33, None, 40, 36, 55, np.nan, 35],
    Job=["Student", np.nan, "Developer", "Chef", "Professor", "CEO", "Banker", "Student"],
    Hand=["L", "R", "R", "B", "L", "L", "R", "R"],
    Height=["T", "S", "M", "S", "T", "S", "S", "T"],
    Capital=[48.35, 150.8, 99.0, 100.0, 182.3, 1101.65, 131.87, 65.8],
)

# `df_nan` keeps the raw frame (missing values included); `df` is the copy
# that the following cells modify.
df_nan = pd.DataFrame(data=data)
df = df_nan.copy()
print(df)

    Name Country   Age        Job Hand Height  Capital
0   John     USA    31    Student    L      T    48.35
1  Sabre  France    33        NaN    R      S   150.80
2    Kim   Korea  None  Developer    R      M    99.00
3   Sato    None    40       Chef    B      S   100.00
4    Lee   Korea    36  Professor    L      T   182.30
5  Smith      UK    55        CEO    L      S  1101.65
6  David     USA   NaN     Banker    R      S   131.87
7   Park   Korea    35    Student    R      T    65.80


df.info()  # Lets us check the data type of each column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     8 non-null      object 
 1   Country  7 non-null      object 
 2   Age      6 non-null      object 
 3   Job      7 non-null      object 
 4   Hand     8 non-null      object 
 5   Height   8 non-null      object 
 6   Capital  8 non-null      float64
dtypes: float64(1), object(6)
memory usage: 576.0+ bytes


# To work with the age data, the Age column has to become an integer type.
# Cast in two steps: first to the nullable float dtype (which accepts the
# missing entries), then to the nullable integer dtype.
age_as_float = df['Age'].astype('Float32')
df['Age'] = age_as_float.astype('Int16')

df
# The difference between None and NaN is left for later.


df.info()  # Re-inspect the column dtypes after the Age cast.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     8 non-null      object 
 1   Country  7 non-null      object 
 2   Age      6 non-null      Int16  
 3   Job      7 non-null      object 
 4   Hand     8 non-null      object 
 5   Height   8 non-null      object 
 6   Capital  8 non-null      float64
dtypes: Int16(1), float64(1), object(5)
memory usage: 536.0+ bytes


# Column names grouped by variable type: categorical vs. numerical.
cat_cols = [
    'Name',
    'Country',
    'Job',
    'Hand',
    'Height',
]
num_cols = [
    'Age',
    'Capital',
]
# One list per line, same output as two separate print calls.
print(cat_cols, num_cols, sep='\n')

['Name', 'Country', 'Job', 'Hand', 'Height']
['Age', 'Capital']


# Look at the categorical columns only.
df.loc[:, cat_cols]
# Encode into a copy so the original frame stays untouched.
df_enc = df.copy(deep=True)


from sklearn.preprocessing import OrdinalEncoder 

# Ordinal-encode the Hand column.
# fit() returns the encoder itself, so construction and fitting can be chained.
ord_enc = OrdinalEncoder().fit(df[['Hand']])
ord_enc.transform(df[['Hand']])

array([[1.],
       [2.],
       [2.],
       [0.],
       [1.],
       [1.],
       [2.],
       [2.]])


# Replace Hand in the working copy with its ordinal codes.
hand_codes = ord_enc.transform(df[['Hand']])
df_enc['Hand'] = hand_codes
df_enc


from sklearn.preprocessing import OneHotEncoder

# One-hot encode the Hand column.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so prefer the new name and fall back only on older releases.
try:
    oh_enc = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    oh_enc = OneHotEncoder(sparse=False)
oh_enc.fit(df[['Hand']])
oh_enc.transform(df[['Hand']])
# Names of the generated one-hot columns.
oh_enc.get_feature_names_out()

array(['Hand_B', 'Hand_L', 'Hand_R'], dtype=object)


# Put the one-hot matrix into a frame labelled with the encoder's feature names.
hand_onehot = pd.DataFrame(
    oh_enc.transform(df[['Hand']]),
    columns=oh_enc.get_feature_names_out(),
)
# Append the one-hot columns side by side, then drop the original Hand column.
df_enc = pd.concat([df_enc, hand_onehot], axis=1).drop(columns='Hand')
df_enc


# Refit the existing encoder on Height using its default category order and
# show the resulting codes in one step.
ord_enc.fit_transform(df[['Height']])

array([[2.],
       [1.],
       [0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [2.]])


# Build a new encoder with an explicit category order: S -> 0, M -> 1, T -> 2.
height_order = ['S', 'M', 'T']
ord_enc = OrdinalEncoder(categories=[height_order])
ord_enc.fit_transform(df[['Height']])


결과 확인은 이렇게.


# Replace Height in the working copy with its ordinal codes.
height_codes = ord_enc.transform(df[['Height']])
df_enc['Height'] = height_codes
df_enc

인공지능 이산수학 실습 코드 내용정리

사족

변수의 종류 확인하기

1. Categorical - 범주형

2. Numerical - 숫자형 혹은 수치형 : 연산이 가능!

사족

	Name	Country	Age	Job	Hand	Height	Capital
0	John	USA	31	Student	L	T	48.35
1	Sabre	France	33	NaN	R	S	150.80
2	Kim	Korea	<NA>	Developer	R	M	99.00
3	Sato	None	40	Chef	B	S	100.00
4	Lee	Korea	36	Professor	L	T	182.30
5	Smith	UK	55	CEO	L	S	1101.65
6	David	USA	<NA>	Banker	R	S	131.87
7	Park	Korea	35	Student	R	T	65.80