#!pip uninstall mp2024pkg -y
#!pip install git+https://github.com/guebin/mp2024pkg.git
10wk-1: Dataset 클래스
1. 강의영상
2. Imports
import pandas as pd
import numpy as np
import datasets
import transformers
import torch
from mp2024pkg import signature, show
from rich import print as rprint
/home/cgb3/anaconda3/envs/hf/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
emotion = datasets.load_dataset('emotion')
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
3. Dataset 형식이해
- emotion 데이터셋 일부를 d로 선언
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
- d 는 아래와 같이 length-\(n\) 인 list로 이해하는것이 편리하다.
- dataset = [example_1, example_2, example_3 , example_4]
- example_i = {‘text’: xxx, ‘label’: yyy}
dDataset({
features: ['text', 'label'],
num_rows: 4
})
[d[0],d[1],d[2],d[3]][{'text': 'i didnt feel humiliated', 'label': 0},
{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'label': 0},
{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3},
{'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
'label': 2}]
d.to_list()[{'text': 'i didnt feel humiliated', 'label': 0},
{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'label': 0},
{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3},
{'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
'label': 2}]
- 그런데 Dataset은 특이하게도 아래의 문법이 동작했었다.
d['text']['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
d['label'][0, 0, 3, 2]
- 위의 결과를 관찰하면 Dataset 는 마치 dictionary 처럼 느껴진다. 실제로 경우에 따라서 Dataset을 dictionary 처럼 생각해도 된다. 이때 Dataset은 아래와 같은 구조로 이해하는게 편리하다.
- d = examples = {‘text’:[xxx,xxxx,xxxxx,xxxxxx], ‘label’:[yyy,yyyy,yyyyy,yyyyyy]}
dct = {'text': d['text'], 'label':d['label']}
dct{'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property'],
'label': [0, 0, 3, 2]}
d.to_dict(){'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property'],
'label': [0, 0, 3, 2]}
- 복습: 딕셔너리를 데이터프레임과 비슷하게 생각할 수 있었음.
pd.DataFrame({'a':[1,2,3], 'b':[2,3,4]})| a | b | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 2 | 3 |
| 2 | 3 | 4 |
- 이 개념을 확장하면 d역시 데이터프레임과 비슷하게 이해할 수도 있다.
pd.DataFrame(d.to_dict())| text | label | |
|---|---|---|
| 0 | i didnt feel humiliated | 0 |
| 1 | i can go from feeling so hopeless to so damned... | 0 |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 |
d.to_pandas()| text | label | |
|---|---|---|
| 0 | i didnt feel humiliated | 0 |
| 1 | i can go from feeling so hopeless to so damned... | 0 |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 |
4. 쉬운 함수들
A. .select()
# 예시1
#d = datasets.Dataset.from_list([emotion['train'][0],emotion['train'][1],emotion['train'][2],emotion['train'][3]])
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
#
B. .shuffle()
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
show(d.shuffle())List Overview:
Total items: 4
1. list[0]
- Type: dict
- Length: 2
- Values: {'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'label': 2}
2. list[1]
- Type: dict
- Length: 2
- Values: {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3}
3. list[2]
- Type: dict
- Length: 2
- Values: {'text': 'i didnt feel humiliated', 'label': 0}
4. list[3]
- Type: dict
- Length: 2
- Values: {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0}
#
# 예시2
# emotion['train']에서 처음 4개의 observation/example 을 뽑는 코드
d = emotion['train'].select(range(4))
d Dataset({
features: ['text', 'label'],
num_rows: 4
})
# emotion['train']에서 랜덤으로 4개의 observation/example 을 뽑는 코드
d = emotion['train'].shuffle().select(range(4))
d Dataset({
features: ['text', 'label'],
num_rows: 4
})
#
C. .select_columns()
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.select_columns(['text'])Dataset({
features: ['text'],
num_rows: 4
})
d.select_columns(['label'])Dataset({
features: ['label'],
num_rows: 4
})
d.select_columns(['text','label'])Dataset({
features: ['text', 'label'],
num_rows: 4
})
#
D. .set_format()
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.set_format(type="pandas")d['text']0 i didnt feel humiliated
1 i can go from feeling so hopeless to so damned...
2 im grabbing a minute to post i feel greedy wrong
3 i am ever feeling nostalgic about the fireplac...
Name: text, dtype: object
d['label']0 0
1 0
2 3
3 2
Name: label, dtype: int64
#
# 예시2
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.set_format(type="pandas",columns=['label'])d['text']['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
d['label']0 0
1 0
2 3
3 2
Name: label, dtype: int64
#
# 예시3
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.set_format(type="pt",columns=['label'])d['text']['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
d['label']tensor([0, 0, 3, 2])
#
E. .reset_format()
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.set_format(type="pt",columns=['label'])d['text']['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
d['label']tensor([0, 0, 3, 2])
d.reset_format()d['text']['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
d['label'][0, 0, 3, 2]
#
5. .map()
A. d.map()
# 예제1 – .map()에 대한 이해
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.map()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이1) – d.map()을 사용하지 않은 풀이..
d는 아래와 같은 구조로 이해할 수 있음
- d = [example_1, example_2, example_3, example_4]
- example_i = {‘text’: xxx, ‘label’: yyy}
리스트화
lst = d.to_list()
lst[{'text': 'i didnt feel humiliated', 'label': 0},
{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'label': 0},
{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3},
{'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
'label': 2}]
리스트의 첫 요소에 변환적용
l = lst[0]
l{'text': 'i didnt feel humiliated', 'label': 0}
r = tokenizer(l['text'])
r{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
l와 tokenizer(l['text'])을 합침
l{'text': 'i didnt feel humiliated', 'label': 0}
r{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
l|r {'text': 'i didnt feel humiliated', 'label': 0, 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
반복
lst2 = [l | tokenizer(l['text']) for l in lst]
lst2[{'text': 'i didnt feel humiliated', 'label': 0, 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0, 'input_ids': [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3, 'input_ids': [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
{'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'label': 2, 'input_ids': [101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996, 13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996, 3200, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}]
lst2를 d2로..
d2 = datasets.Dataset.from_list(lst2)
d2Dataset({
features: ['text', 'label', 'input_ids', 'attention_mask'],
num_rows: 4
})
d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
(풀이2)
def m_transform(example):
    """Tokenize the text of a single example.

    `example` has the form {'text': str, 'label': int}. The return value is
    the tokenizer output for `example['text']`.
    """
    # example = l = {'text': 'i didnt feel humiliated', 'label': 0}
    result = tokenizer(example['text'])
    return result


d2 = d.map(m_transform)
d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
#
.map()의 특징
- 특징1: `m_transform()`은 입력으로 `example = {'text':xxx, 'label':yyy}`꼴을 가정한다.
- 특징2: `.map()`은 변환전 dict와 변환후 dict를 합친다.
B. dd.map()
# 예제1 – dd에도 .map을 적용할 수 있음
아래와 같은 DatasetDict가 있다고 하자.
dd = datasets.DatasetDict({
'train':emotion['train'].select(range(4)),
'test':emotion['test'].select(range(4)),
})
ddDatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 4
})
test: Dataset({
features: ['text', 'label'],
num_rows: 4
})
})
dd.map()을 이용하여 아래와 같이 변환하라.
| tr/test | 데이터 | 변환전 | 변환후 |
|---|---|---|---|
| train | d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| train | d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
| test | d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| test | d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이)
def m_transform(example):
    """Tokenize the text of a single example.

    `example` has the form {'text': str, 'label': int}. The return value is
    the tokenizer output for `example['text']`.
    """
    # example = l = {'text': 'i didnt feel humiliated', 'label': 0}
    result = tokenizer(example['text'])
    return result


# The same per-example function is applied to the DatasetDict.
dd2 = dd.map(m_transform)
dd2['train'][0]
# {'text': 'i didnt feel humiliated',
#  'label': 0,
#  'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
#  'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
dd2['train'][:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
dd2['test'][0]{'text': 'im feeling rather rotten so im not very ambitious right now',
'label': 0,
'input_ids': [101,
10047,
3110,
2738,
11083,
2061,
10047,
2025,
2200,
12479,
2157,
2085,
102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
dd2['test'][:1]{'text': ['im feeling rather rotten so im not very ambitious right now'],
'label': [0],
'input_ids': [[101,
10047,
3110,
2738,
11083,
2061,
10047,
2025,
2200,
12479,
2157,
2085,
102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
#
C. d.map(batched=True)
# 예제1 – d.map(batched=True)의 이해
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(8))
dDataset({
features: ['text', 'label'],
num_rows: 8
})
d.map(batched=True)을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 | 특이사항 |
|---|---|---|---|
| d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
변환시 2개의 example씩 묶어서 패딩 |
| d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
변환시 2개의 example씩 묶어서 패딩 |
(풀이1) – 실패
def m_transform(example):
    """Tokenize one example's text with `padding=True`.

    The function receives a single example, so the tokenizer sees one text
    at a time. The output printed after this cell shows input_ids of
    lengths 7, 23, 12 and 22, i.e. no padding across examples.
    """
    # example = {'text':xxx, 'label':yyy}
    result = tokenizer(example['text'],padding=True)
    return result


d2 = d.map(m_transform)
d2Dataset({
features: ['text', 'label', 'input_ids', 'attention_mask'],
num_rows: 8
})
rprint("d2[:4]['input_ids']")
show(d2[:4]['input_ids'])d2[:4]['input_ids']
List Overview:
Total items: 4
1. list[0]
- Type: list
- Length: 7
- Values: [101, 1045, 2134, 2102, 2514, 26608, 102]
2. list[1]
- Type: list
- Length: 23
- Values: [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]
3. list[2]
- Type: list
- Length: 12
- Values: [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102]
4. list[3]
- Type: list
- Length: 22
- Values: [101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996, 13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996, 3200, 102]
(풀이2) – 성공
# def m_transform(example):
# # example = {'text':xxx, 'label':yyy}
# result = tokenizer(example['text'], padding=True)
# return result
def m_transform_batch(example_batch):
    """Tokenize a batch of examples, padding within the batch.

    `example_batch` has the form {'text': [str, ...], 'label': [int, ...]}.
    The whole list of texts is passed to the tokenizer with `padding=True`.
    """
    # example_batch = {'text':[xxx,xxxx], 'label':[yyy,yyyy]}
    result = tokenizer(example_batch['text'], padding=True)
    return result


# batched=True hands the function batches; batch_size=2 groups examples in pairs.
d2 = d.map(m_transform_batch,batched=True,batch_size=2)
d2Dataset({
features: ['text', 'label', 'input_ids', 'attention_mask'],
num_rows: 8
})
rprint("d2[:4]['input_ids']")
show(d2[:4]['input_ids'])d2[:4]['input_ids']
List Overview:
Total items: 4
1. list[0]
- Type: list
- Length: 23
- Values: [101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2. list[1]
- Type: list
- Length: 23
- Values: [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]
3. list[2]
- Type: list
- Length: 22
- Values: [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4. list[3]
- Type: list
- Length: 22
- Values: [101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996, 13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996, 3200, 102]
#
.map(batched=True)의 특징
- 특징1: `m_transform_batch()`은 입력으로 `example_batch = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}`꼴을 가정한다.
- 특징2: `example_batch`는 `batch_size`만큼 데이터가 있다고 생각한다.
D. d.map() + 칼럼선택
# 예제1 – attention_mask 제외
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.map()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] |
| d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] |
(풀이1)
def m_transform(example):
    """Tokenize the text of a single example and return the tokenizer output."""
    # example = {'text':xxx, 'label':yyy}
    result = tokenizer(example['text'])
    return result


d2 = d.map(m_transform)
d2 = d2.select_columns(['text', 'label', 'input_ids'])
d2Dataset({
features: ['text', 'label', 'input_ids'],
num_rows: 4
})
d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102]}
d2[:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]]}
(풀이2)
def m_transform(example):
    """Tokenize one example's text and drop `attention_mask` from the result."""
    # example = {'text':xxx, 'label':yyy}
    result = tokenizer(example['text'])
    # Remove the unwanted key from the tokenizer output before returning it.
    del result['attention_mask']
    return result


d2 = d.map(m_transform)
d2Dataset({
features: ['text', 'label', 'input_ids'],
num_rows: 4
})
d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102]}
d2[:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]]}
#
# 예제2 – text 제외
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.map()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| d[:1] | text: [str] label: [int] |
label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이1)
def m_transform(example):
    """Tokenize one example's text and delete 'text' from the input example.

    The key is deleted from `example` (the input), not from `result`. The
    dataset printed after this cell no longer lists the `text` feature.
    """
    # example = {'text': xxx, 'label':yyy}
    result = tokenizer(example['text'])
    del example['text']
    return result


d2 = d.map(m_transform)
d2Dataset({
features: ['label', 'input_ids', 'attention_mask'],
num_rows: 4
})
d2[0]{'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
(풀이2)
def m_transform(example):
    """Tokenize the text of a single example and return the tokenizer output."""
    # example = {'text': xxx, 'label':yyy}
    result = tokenizer(example['text'])
    return result


d2 = d.map(m_transform)
d2 = d2.select_columns(['label', 'input_ids', 'attention_mask'])
d2 Dataset({
features: ['label', 'input_ids', 'attention_mask'],
num_rows: 4
})
d2[0]{'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
#
.map()에서 컬럼을 제외하려면?
- `del`을 이용한 풀이: 제외하고자 하는 column이 `example`에 있을 경우와 `result`에 있을 경우 미묘하게 다름.
- `.select_columns()`를 이용한 풀이: 제외하고자 하는 column이 `example`에 있든지 `result`에 있든지 상관없음.
E. d.map() + 타입변환 (\(\star\))
# 예제1 – .map()을 이용한 타입변환은 불가능
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.map()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
label: int input_ids: tensor([int,…,int]) attention_mask: tensor([int,…,int]) |
| d[:1] | text: [str] label: [int] |
label:[int] input_ids: tensor([[int,…,int]]) attention_mask: tensor([[int,…,int]]) |
| d[:2] | text: [str,str] label: [int,int] |
label:[int,int] input_ids: tensor([[int,…,int],[int,…,int]]) attention_mask: tensor([[int,…,int],[int,…,int]]) |
(풀이1) – 실패
def m_transform(example):
    """Tokenize one example's text and convert the outputs to torch tensors.

    Deletes 'text' from `example` and replaces `input_ids` and
    `attention_mask` in the result with `torch.tensor` versions. The notes
    mark this attempt as a failure: the `d2[0]` output below still shows
    plain lists.
    """
    # example = {'text': xxx, 'label':yyy}
    result = tokenizer(example['text'])
    del example['text']
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result


d2 = d.map(m_transform)
d2[0]
# {'label': 0,
#  'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
#  'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
(풀이2) – 실패
def m_transform(example):
    """Tokenize one example's text with `return_tensors='pt'` and delete 'text'.

    The notes mark this attempt as a failure: the outputs printed after this
    cell are nested lists (one extra level of nesting), not tensors.
    """
    # example = {'text': xxx, 'label':yyy}
    result = tokenizer(example['text'],return_tensors='pt')
    del example['text']
    return result


d2 = d.map(m_transform)
d2Dataset({
features: ['label', 'input_ids', 'attention_mask'],
num_rows: 4
})
d2[0]{'label': 0,
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
d2[:1]{'label': [0],
'input_ids': [[[101, 1045, 2134, 2102, 2514, 26608, 102]]],
'attention_mask': [[[1, 1, 1, 1, 1, 1, 1]]]}
도대체 왜 자료형을 안바꿔주는거야??
(풀이3) – 이것도 실패한다고?
lst = d.to_list()
lst[{'text': 'i didnt feel humiliated', 'label': 0},
{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'label': 0},
{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3},
{'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
'label': 2}]
# Build the transformed examples by hand: tokenize each example, convert the
# token fields to tensors, drop 'text', and merge the rest with the result.
lst2 = []
for l in lst:
    result = tokenizer(l['text'])
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    # Mutates the dict inside `lst` in place; 'text' is gone from it afterwards.
    del l['text']
    lst2.append(l|result)
lst2[{'label': 0, 'input_ids': tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])},
{'label': 0, 'input_ids': tensor([ 101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061,
9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998,
2003, 8300, 102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
{'label': 3, 'input_ids': tensor([ 101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505,
3308, 102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
{'label': 2, 'input_ids': tensor([ 101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996,
13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996,
3200, 102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}]
d2 = datasets.Dataset.from_list(lst2)
d2Dataset({
features: ['label', 'input_ids', 'attention_mask'],
num_rows: 4
})
d2[0]{'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
(풀이4) – 성공??
def m_transform(example):
    """Tokenize one example's text and delete 'text' from the input example."""
    # example = {'text': xxx, 'label': yyy}
    result = tokenizer(example['text'])
    del example['text']
    return result


d2 = d.map(m_transform)
# Request torch output for the token columns; output_all_columns=True keeps
# the remaining column ('label') in the returned dict, as the output shows.
d2.set_format(type="pt",columns=['input_ids','attention_mask'],output_all_columns=True)
d2[0]
# {'input_ids': tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
#  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
#  'label': 0}
d2[:1]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]),
'label': [0]}
d2[:2]['input_ids'] [tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
tensor([ 101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061,
9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998,
2003, 8300, 102])]
#
6. .with_transform()
A. d.with_transform()
# 예제1
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.with_transform()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이1)
dct = d.to_dict()
dct{'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
'im grabbing a minute to post i feel greedy wrong',
'i am ever feeling nostalgic about the fireplace i will know that it is still on the property'],
'label': [0, 0, 3, 2]}
result = tokenizer(d['text'])
dct2 = dct | result
d2 = datasets.Dataset.from_dict(dct2)d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'text': ['i didnt feel humiliated'],
'label': [0],
'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
(풀이2)
def w_transform(examples):
    """Tokenize a batch of texts and merge the result with the input batch.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}. The
    original columns are merged in explicitly so they appear in the output.
    """
    #examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    result = examples | result
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[0:1]{'text': ['i didnt feel humiliated'], 'label': [0], 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
#
.with_transform()와 .map()의 차이점
- `.map()`은 입력으로 example꼴을, `.with_transform()`은 입력으로 examples를 기대한다.
- `.map()`은 변환전과 변환후 데이터가 자동으로 합쳐진다. `.with_transform()`은 변환후 데이터만 살아남는다.
- `.map()`은 변환이 실제로 이루어진다. `.with_transform()`은 변환이 실제로 이루어지지 않다가 `d[0]`, `d[:1]` 등이 실행되는 순간 이루어진다.
# 예제2
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.with_transform()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] |
| d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] |
(풀이)
def w_transform(examples):
    """Tokenize a batch, drop `attention_mask`, and carry over text and label.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}.
    """
    # examples = {'text':[xxx,xxxx,....], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    del result['attention_mask']
    # Copy the original columns into the result so they are part of the output.
    result['text'] = examples['text']
    result['label'] = examples['label']
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'text': 'i didnt feel humiliated',
'label': 0}
d2[:1]{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'text': ['i didnt feel humiliated'], 'label': [0]}
#
# 예제3
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.with_transform()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| d[:1] | text: [str] label: [int] |
label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이)
def w_transform(examples):
    """Tokenize a batch of texts and carry over only the labels.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}.
    'text' is not copied into the result.
    """
    # examples = {'text':[xxx,xxxx,....], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    result['label'] = examples['label']
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1],
'label': 0}
d2[:1]{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'label': [0]}
#
# 예제4
아래와 같은 Dataset이 있다고 하자.
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
d.with_transform()을 이용하여 아래와 같이 변환하라.
| 데이터 | 변환전 | 변환후 |
|---|---|---|
| d[0] | text: str label: int |
label: tensor(int) input_ids: tensor([int,…,int]) attention_mask: tensor([int,…,int]) |
| d[:1] | text: [str] label: [int] |
label: tensor([int]) input_ids: tensor([[int,…,int]]) attention_mask: tensor([[int,…,int]]) |
(풀이) – 실패
def w_transform(examples):
    """Tokenize a batch without padding and convert every field to a tensor.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}.

    Raises:
        ValueError: from `torch.tensor` when the tokenized sequences in the
            batch have different lengths, as the `d2[:2]` call below shows.
    """
    # examples = {'text':[xxx,xxxx,...],'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    result['label'] = torch.tensor(examples['label'])
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
'label': tensor(0)}
d2[:1]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}
d2[:2]ValueError: expected sequence of length 7 at dim 1 (got 23)
에러나는 이유
examples = d[:2]
examples{'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
'label': [0, 0]}
result = tokenizer(examples['text'])
result{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
torch.tensor(examples['label'])tensor([0, 0])
result['label'] = torch.tensor(examples['label'])
result{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'label': tensor([0, 0])}
torch.tensor(result['input_ids'])ValueError: expected sequence of length 7 at dim 1 (got 23)
- 패딩….
(풀이2) – 성공
def w_transform(examples):
    """Tokenize a batch with padding and convert every field to a tensor.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}.
    `padding=True` makes the sequences in the batch the same length, so
    `torch.tensor` can build a rectangular tensor.
    """
    # examples = {'text':[xxx,xxxx,...],'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'],padding=True)
    result['label'] = torch.tensor(examples['label'])
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
'label': tensor(0)}
d2[:1]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}
d2[:2]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0],
[ 101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061,
9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998,
2003, 8300, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0, 0])}
(풀이3) – 이것도 성공..
def w_transform(examples):
    """Tokenize a batch with padding, asking the tokenizer for torch tensors.

    `examples` has the form {'text': [str, ...], 'label': [int, ...]}.
    `return_tensors="pt"` makes the tokenizer return tensors, so only the
    labels are converted by hand.
    """
    # examples = {'text':[xxx,xxxx,...],'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'],padding=True,return_tensors="pt")
    result['label'] = torch.tensor(examples['label'])
    return result


d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
'label': tensor(0)}
d2[:1]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}
d2[:2]{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0],
[ 101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061,
9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998,
2003, 8300, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0, 0])}
#
B. dd.with_transform()
# 예제1
아래와 같은 DatasetDict가 있다고 하자.
dd = datasets.DatasetDict({
'train':emotion['train'].select(range(4)),
'test':emotion['test'].select(range(4)),
})
ddDatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 4
})
test: Dataset({
features: ['text', 'label'],
num_rows: 4
})
})
dd.with_transform()을 이용하여 아래와 같이 변환하라.
| tr/test | 데이터 | 변환전 | 변환후 |
|---|---|---|---|
| train | d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| train | d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
| test | d[0] | text: str label: int |
text: str label: int input_ids: [int,…,int] attention_mask: [int,…,int] |
| test | d[:1] | text: [str] label: [int] |
text: [str] label: [int] input_ids: [[int,…,int]] attention_mask: [[int,…,int]] |
(풀이)
def w_transform(examples):
    """Tokenize a batch and keep the original columns alongside the tokens.

    Args:
        examples: A batch in dict-of-lists form,
            ``{'text': [str, ...], 'label': [int, ...]}``.

    Returns:
        A mapping with the original ``'text'`` and ``'label'`` entries plus
        the tokenizer's ``'input_ids'`` and ``'attention_mask'`` lists.
    """
    # examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,....]}
    result = tokenizer(examples['text'])
    # Merge so that the original columns are kept in the output as well.
    result = examples | result
    return result


# Applying the transform to the DatasetDict applies it to every split
# ('train' and 'test'); it runs when items are accessed.
dd2 = dd.with_transform(w_transform)
dd2DatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 4
})
test: Dataset({
features: ['text', 'label'],
num_rows: 4
})
})
dd2['train'][0]{'text': 'i didnt feel humiliated',
'label': 0,
'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
dd2['train'][:1]{'text': ['i didnt feel humiliated'], 'label': [0], 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
dd2['test'][0]{'text': 'im feeling rather rotten so im not very ambitious right now',
'label': 0,
'input_ids': [101,
10047,
3110,
2738,
11083,
2061,
10047,
2025,
2200,
12479,
2157,
2085,
102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
dd2['test'][:1]{'text': ['im feeling rather rotten so im not very ambitious right now'], 'label': [0], 'input_ids': [[101, 10047, 3110, 2738, 11083, 2061, 10047, 2025, 2200, 12479, 2157, 2085, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
#
C. d.reset_format()
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
def w_transform(examples):
    """Tokenize the text of a batch, returning only the tokenizer output.

    Args:
        examples: A batch in dict-of-lists form,
            ``{'text': [str, ...], 'label': [int, ...]}``.

    Returns:
        The tokenizer output for ``examples['text']`` (``input_ids`` and
        ``attention_mask``); ``'text'`` and ``'label'`` are not included.
    """
    # examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    return result


# The transform runs when items are accessed (d2[0], d2[:1], ...).
d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
d2[:1]{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}
d2.reset_format()
d2[0]{'text': 'i didnt feel humiliated', 'label': 0}
d2[0:1]{'text': ['i didnt feel humiliated'], 'label': [0]}
#
# 예시2
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
def w_transform(examples):
    """Tokenize a batch with padding and carry the labels along.

    Args:
        examples: A batch in dict-of-lists form,
            ``{'text': [str, ...], 'label': [int, ...]}``.

    Returns:
        The tokenizer output for ``examples['text']`` (``input_ids`` and
        ``attention_mask`` as Python lists) with the original ``'label'``
        list added. ``'text'`` is not included.
    """
    # examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    # padding=True pads every sequence in the batch to the longest one.
    result = tokenizer(examples['text'], padding=True)
    result['label'] = examples['label']
    return result


# The transform runs when items are accessed (d2[0], d2[:2], ...).
d2 = d.with_transform(w_transform)
d2Dataset({
features: ['text', 'label'],
num_rows: 4
})
d2[0]{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1],
'label': 0}
d2[:1]{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'label': [0]}
d2[:2]{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'label': [0, 0]}
d2.set_format(type="pt")
d2[0]{'text': 'i didnt feel humiliated', 'label': tensor(0)}
d2[:1]{'text': ['i didnt feel humiliated'], 'label': tensor([0])}
d2[:2]{'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
'label': tensor([0, 0])}
#
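.with_transform 은 .set_format 과 궁합이 맞지 않는다. .set_format 을 호출하면 .with_transform 으로 등록한 변환이 사라지고, 위 결과처럼 원래의 컬럼(text, label)만 지정한 형식으로 반환된다.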
7. 미묘한 차이
# 예시1
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
def m_transform(example):
    """Tokenize a single example (used with ``d.map``).

    Args:
        example: One example, ``{'text': str, 'label': int}``.

    Returns:
        The tokenizer output for ``example['text']``.
    """
    # example = {'text':xxx, 'label':yyy}
    result = tokenizer(example['text'])
    return result


def w_transform(examples):
    """Tokenize a batch and keep the original columns (used with
    ``d.with_transform``).

    Args:
        examples: A batch in dict-of-lists form,
            ``{'text': [str, ...], 'label': [int, ...]}``.

    Returns:
        A mapping with the original ``'text'`` and ``'label'`` entries plus
        the tokenizer's ``'input_ids'`` and ``'attention_mask'``.
    """
    # examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'])
    # Merge so that the original columns are kept in the output as well.
    result = examples | result
    return result


d2 = d.map(m_transform)
d2.set_format(type='pt')
d2[:1]{'text': ['i didnt feel humiliated'],
'label': tensor([0]),
'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
d3 = d.with_transform(w_transform)
d3.set_format(type='pt')
d3[:1]{'text': ['i didnt feel humiliated'], 'label': tensor([0])}
#
# 예시2
d = emotion['train'].select(range(4))
dDataset({
features: ['text', 'label'],
num_rows: 4
})
def m_transform(example):
    """Tokenize a single example (used with ``d.map``).

    Args:
        example: One example, ``{'text': str, 'label': int}``.

    Returns:
        The tokenizer output for ``example['text']``.
    """
    # example = {'text':xxx, 'label':yyy}
    result = tokenizer(example['text'])
    return result


def w_transform(examples):
    """Tokenize a batch into PyTorch tensors and keep the original columns
    (used with ``d.with_transform``).

    Args:
        examples: A batch in dict-of-lists form,
            ``{'text': [str, ...], 'label': [int, ...]}``.

    Returns:
        A mapping with the original ``'text'`` list, a ``'label'`` tensor,
        and the tokenizer's ``'input_ids'`` / ``'attention_mask'`` tensors.

    Raises:
        ValueError: Raised by the tokenizer when the batch contains texts
            of different token lengths, because no padding is requested
            here. This is intentional: the example contrasts this
            behaviour with ``map`` + ``set_format``.
    """
    # examples = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(examples['text'], return_tensors="pt")
    # Merge so that the original columns are kept in the output as well.
    result = examples | result
    result['label'] = torch.tensor(result['label'])
    return result


# Up to this point the two approaches look the same.
d2 = d.map(m_transform)
d2.set_format(type='pt')
d2[:1]{'text': ['i didnt feel humiliated'],
'label': tensor([0]),
'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
d3 = d.with_transform(w_transform)
d3[:1]{'text': ['i didnt feel humiliated'], 'label': tensor([0]), 'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 26608, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
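하지만 아래처럼 길이가 다른 여러 개의 example을 한꺼번에 꺼내면 결과가 미묘하게 다르다. .map + .set_format 은 길이가 서로 다른 텐서들의 리스트를 반환하지만, return_tensors="pt" 를 사용한 .with_transform 은 패딩이 없어서 에러가 발생한다.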
d2 = d.map(m_transform)
d2.set_format(type='pt')
d2[:2]{'text': ['i didnt feel humiliated',
'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
'label': tensor([0, 0]),
'input_ids': [tensor([ 101, 1045, 2134, 2102, 2514, 26608, 102]),
tensor([ 101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061,
9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998,
2003, 8300, 102])],
'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1]),
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}
d3 = d.with_transform(w_transform)
d3[:2]ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
#