# 자료 구조 (Data structures)
#
# pandas & duckdb
# 자료 구조
#
# wefree 2022. 8. 8. 23:58
import numpy as np
import pandas as pd

# Build a Series from a NumPy array, forcing a 32-bit integer dtype.
arr = np.arange(100, 105)
s = pd.Series(arr, dtype='int32')

# Without an explicit index a Series gets a default RangeIndex.
s = pd.Series(['A', 'B', 'C'])

s.index  # RangeIndex(start=0, stop=3, step=1)
s[0]  # 'A' -- 0 is looked up as a label of the RangeIndex
# s[-1]  # raises an exception: -1 is not a label of the RangeIndex

# With string labels, s[...] is for label lookups; positional access
# (including negative positions) goes through .iloc.  The plain s[-1]
# fallback to positions on a non-integer index is deprecated in pandas.
s = pd.Series(['A', 'B', 'C'], index=['a', 'b', 'c'])
s['a']  # 'A'
s.iloc[-1]  # 'C' -- last element by position
s.index  # Index(['a', 'b', 'c'], dtype='object')

# Assigning to .index relabels the existing values in place.
s.index = ['d', 'e', 'f']
s
# d    A
# e    B
# f    C
# dtype: object


# Basic attributes of a Series.
s = pd.Series(data=['A', 'B', 'C'])
s.values  # array(['A', 'B', 'C'], dtype=object)
s.ndim  # 1
s.shape  # (3,)

# Side note: the trailing comma is what makes a one-element tuple.
type((3,))  # tuple
type((3))  # int

# A missing value is represented by np.nan.
s = pd.Series(data=['A', np.nan, 'B', 'C'])
s
# 0      A
# 1    NaN
# 2      B
# 3      C
# dtype: object

# Fancy indexing: pass a list of labels to select several entries at once.
s = pd.Series(data=['A', 'B', 'C'], index=['a', 'b', 'c'])
s['a']  # 'A'
s[['a', 'c']]
# a    A
# c    C
# dtype: object

# Boolean indexing: a list of booleans keeps the entries marked True.
s = pd.Series(data=['A', 'B', 'C'], index=['a', 'b', 'c'])
s[[True, False, True]]  # keeps 'a' and 'c'

# A comparison on a Series yields a boolean Series usable as a mask.
s = pd.Series(data=[1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
cond = s >= 3
cond
# a    False
# b    False
# c     True
# d     True
# e     True
# dtype: bool
s[cond]  # c, d, e
s[s >= 3]  # same selection, mask written inline

# Combine masks with & (and) / | (or); each comparison needs parentheses.
s[(s >= 3) & (s <= 4)]  # c, d
s[(s <= 1) | (s >= 4)]  # a, d, e

# isnull() / notnull() produce boolean masks marking missing values.
s = pd.Series(data=['A', np.nan, 'B', 'C'])
s.isnull()
# 0    False
# 1     True
# 2    False
# 3    False
s[s.isnull()]  # only the NaN entry (position 1)
s[s.notnull()]  # everything except the NaN entry

# Slicing: an integer slice excludes the stop position ...
s = pd.Series(data=['A', 'B', 'C'], index=['a', 'b', 'c'])
s[0:2]
# a    A
# b    B

# ... while a label slice includes the stop label.
s['a':'c']
# a    A
# b    B
# c    C  <--- 'c' is included

##################################################################

# Build a DataFrame from a list of rows plus explicit column names.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    columns=['A', 'B', 'C'],
)
df
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# Build a DataFrame from a dict: each key becomes a column.
df = pd.DataFrame(data={
    'name': ['A', 'B', 'C'],
    'age': [1, 2, 3],
    'children': [4, 5, 6],
})
df
#   name  age  children
# 0    A    1         4
# 1    B    2         5
# 2    C    3         6

# Basic attributes of a DataFrame.
df.index  # RangeIndex(start=0, stop=3, step=1)
df.columns  # Index(['name', 'age', 'children'], dtype='object')
df.values
df.dtypes
# name        object
# age          int64
# children     int64

# .T transposes: rows become columns and vice versa.
df.T
#           0  1  2
# name      A  B  C
# age       1  2  3
# children  4  5  6


df = pd.DataFrame(data={
    'name': ['A', 'B', 'C'],
    'age': [1, 2, 3],
    'children': [4, 5, 6],
})

# Replace the default RangeIndex with string row labels.
df.index = ['a', 'b', 'c']
df
#   name  age  children
# a    A    1         4
# b    B    2         5
# c    C    3         6

# Selecting a single column returns a Series ...
df['name']
# a    A
# b    B
# c    C
type(df['name'])  # pandas.core.series.Series

# ... and selecting with a list of column names returns a DataFrame.
df[['name', 'children']]

df = pd.DataFrame(data={
    'name': ['A', 'B', 'C'],
    'age': [1, 2, 3],
    'children': [4, 5, 6],
})

# rename() returns a new, renamed DataFrame by default.
df.rename(
    columns={'name': '이름'},
)
#   이름  age  children
# 0  A    1         4
# 1  B    2         5
# 2  C    3         6

df  # the original df has NOT been changed!
#    name  age  children
# 0    A    1         4
# 1    B    2         5
# 2    C    3         6

# Pass inplace=True to apply the change to the original object.
df.rename(
    columns={'name': '이름'},
    inplace=True,
)
# Reference: https://www.udemy.com/course/pandas-i/