pandas & duckdb
자료 구조
wefree
2022. 8. 8. 23:58
https://wikidocs.net/book/4639 참고
import numpy as np
import pandas as pd
# --- Series: construction, index, and basic attributes ----------------------

# Build a Series from a NumPy array, forcing the element dtype.
arr = np.arange(100, 105)
s = pd.Series(arr, dtype='int32')

# Without an explicit index, pandas assigns a default RangeIndex.
s = pd.Series(['A', 'B', 'C'])
s.index  # RangeIndex(start=0, stop=3, step=1)
s[0]  # 'A'  (0 is looked up as a label in the RangeIndex)
# s[-1]  # raises an exception: -1 is looked up as a label, which does not exist

# With custom string labels, use the label itself for label-based access ...
s = pd.Series(['A', 'B', 'C'], index=['a', 'b', 'c'])
s['a']  # 'A'
# ... and .iloc for positional access. Plain s[-1] only worked here through a
# positional fallback of Series.__getitem__ that pandas has deprecated, so
# the position-based lookup is spelled out explicitly.
s.iloc[-1]  # 'C'
s.index  # Index(['a', 'b', 'c'], dtype='object')

# The index can be replaced wholesale by assigning a new list of labels.
s.index = ['d', 'e', 'f']
s
# d    A
# e    B
# f    C
# dtype: object

# Basic attributes of a Series.
s = pd.Series(['A', 'B', 'C'])
s.values  # array(['A', 'B', 'C'], dtype=object)
s.ndim  # 1
s.shape  # (3,)

# Side note on the shape notation: the trailing comma is what makes a tuple;
# (3) is just the int 3 wrapped in parentheses.
type((3,))  # tuple
type((3))  # int

# Missing values are represented with np.nan.
s = pd.Series(['A', np.nan, 'B', 'C'])
s
# 0      A
# 1    NaN
# 2      B
# 3      C
# dtype: object
# --- Series: selecting elements ---------------------------------------------

# Fancy indexing: pass a list of labels to pick several elements at once.
s = pd.Series(['A', 'B', 'C'], index=['a', 'b', 'c'])
s.loc['a']  # 'A'
s.loc[['a', 'c']]
# a    A
# c    C
# dtype: object

# Boolean indexing: a list of booleans keeps the positions marked True.
s = pd.Series(['A', 'B', 'C'], index=['a', 'b', 'c'])
s.loc[[True, False, True]]

# A comparison on a Series produces a boolean Series usable as a mask.
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
cond = s.ge(3)
cond
# a    False
# b    False
# c     True
# d     True
# e     True
# dtype: bool
s.loc[cond]
s.loc[s.ge(3)]
# Combine masks with & (and) / | (or); each comparison needs its own
# parentheses because & and | bind tighter than the comparison operators.
s.loc[cond & (s <= 4)]
s.loc[(s <= 1) | (s >= 4)]

# Masks for missing / present values.
s = pd.Series(['A', np.nan, 'B', 'C'])
s.isna()
# 0    False
# 1     True
# 2    False
# 3    False
s.loc[s.isna()]
s.loc[s.notna()]

# Slicing: positional slices exclude the stop position ...
s = pd.Series(['A', 'B', 'C'], index=['a', 'b', 'c'])
s.iloc[0:2]
# a    A
# b    B
# ... whereas label slices include the stop label.
s.loc['a':'c']
# a    A
# b    B
# c    C   <--- 'c' is included
##################################################################
# --- DataFrame: construction, attributes, and column selection --------------

# From a nested list: each inner list is one row; column names are given
# separately.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    columns=['A', 'B', 'C'],
)
df
#    A  B  C
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# From a dict: each key becomes a column name, each list the column's values.
df = pd.DataFrame(
    data={
        'name': ['A', 'B', 'C'],
        'age': [1, 2, 3],
        'children': [4, 5, 6],
    }
)
df
#   name  age  children
# 0    A    1         4
# 1    B    2         5
# 2    C    3         6

# Basic attributes of a DataFrame.
df.index  # RangeIndex(start=0, stop=3, step=1)
df.columns  # Index(['name', 'age', 'children'], dtype='object')
df.values
df.dtypes
# name        object
# age          int64
# children     int64

# Transpose: rows become columns and vice versa.
df.transpose()
#           0  1  2
# name      A  B  C
# age       1  2  3
# children  4  5  6

# Same data again, this time with custom row labels.
df = pd.DataFrame(
    data={
        'name': ['A', 'B', 'C'],
        'age': [1, 2, 3],
        'children': [4, 5, 6],
    },
    index=['a', 'b', 'c'],
)
df
#   name  age  children
# a    A    1         4
# b    B    2         5
# c    C    3         6

# Selecting a single column yields a Series ...
df['name']
# a    A
# b    B
# c    C
type(df['name'])  # pandas.core.series.Series
# ... while selecting with a list of column names yields a DataFrame.
df.loc[:, ['name', 'children']]
# --- DataFrame: renaming columns --------------------------------------------
df = pd.DataFrame(
    data={
        'name': ['A', 'B', 'C'],
        'age': [1, 2, 3],
        'children': [4, 5, 6],
    }
)

# rename() returns a new DataFrame with the mapped column names.
df.rename({'name': '이름'}, axis='columns')
#   이름  age  children
# 0   A    1         4
# 1   B    2         5
# 2   C    3         6

df  # the original df is NOT modified by the call above!
#   name  age  children
# 0    A    1         4
# 1    B    2         5
# 2    C    3         6

# To apply the change to the original object itself, pass inplace=True.
df.rename({'name': '이름'}, axis='columns', inplace=True)