1. coco-tiny – 100 points
import datasets, transformers, torch, torchvision, requests
from PIL import Image
from io import BytesIO

coco = datasets.load_dataset("guebin/coco-tiny")
coco
DatasetDict({
train: Dataset({
features: ['image_id', 'coco_url', 'image', 'caption', 'recaption'],
num_rows: 10
})
})
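For orientation: indexing a split with an integer returns one example as a plain dict (a quick optional check):
example = coco['train'][0]
print(example['caption'])      # the original caption string
print(type(example['image']))  # a PIL image object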
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer
DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={
0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
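As a reminder of what this tokenizer returns (a minimal sketch; the sample sentences are made up):
out = tokenizer(["a man riding a horse", "a cat"], padding=True)
print(out['input_ids'])       # lists of token ids; shorter ones padded with 0 ([PAD])
print(out['attention_mask'])  # 1 for real tokens, 0 for padding
With padding=True the tokenizer pads every sequence in a single call to the longest one in that call, which is exactly the per-batch padding that (1) below relies on.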
(1) Tokenize the caption column of the coco dataset using .map(). Follow the detailed instructions.
Detailed instructions
After the transformation, the columns ['image_id', 'coco_url', 'image', 'caption', 'recaption', 'input_ids', 'attention_mask'] must all be present.
input_ids and attention_mask must be padded per batch, with batch_size=10.
That is, after the transformation the following should run:
coco.column_names
{'train': ['image_id',
  'coco_url',
  'image',
  'caption',
  'recaption',
  'input_ids',
  'attention_mask']}
torch.tensor(coco['train']['input_ids']).shape
torch.tensor(coco['train']['attention_mask']).shape
Then remove input_ids and attention_mask from the result, deleting the tokenization output.
(Solution)
coco = coco.map(
    # padding=True pads each batch to its longest sequence; with batch_size=10
    # (the whole 10-row dataset) every row ends up with the same length
    lambda dct: tokenizer(dct['caption'], padding=True),
    batched=True,
    batch_size=10
)
coco = coco.remove_columns(['input_ids', 'attention_mask'])
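A quick optional sanity check that the removal worked:
assert 'input_ids' not in coco['train'].column_names
assert 'attention_mask' not in coco['train'].column_names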
(2) Convert each observation (= example) in the image column to a tensor and inspect its shape. Do all images have the same size? Are all of them color images? Next, apply .convert("RGB") to each observation in the image column and inspect the shapes again. Do all images have the same size? Are all of them color images?
(Solution)
to_tensor = torchvision.transforms.ToTensor()
[t.shape for t in map(to_tensor, coco['train']['image'])]
[torch.Size([3, 453, 640]),
torch.Size([3, 612, 612]),
torch.Size([3, 480, 640]),
torch.Size([3, 500, 375]),
torch.Size([3, 427, 640]),
torch.Size([3, 480, 640]),
torch.Size([3, 338, 500]),
torch.Size([1, 427, 640]),
torch.Size([3, 375, 500]),
torch.Size([3, 640, 426])]
Reading off the shapes: the sizes all differ, and the eighth image has a single channel, i.e., it is grayscale, so not every image is a color image. Applying .convert("RGB") before tensorizing forces three channels:
compose = torchvision.transforms.Compose([
    lambda x: x.convert("RGB"),  # force 3 channels, even for grayscale inputs
    torchvision.transforms.ToTensor()
])
[t.shape for t in map(compose, coco['train']['image'])]
[torch.Size([3, 453, 640]),
torch.Size([3, 612, 612]),
torch.Size([3, 480, 640]),
torch.Size([3, 500, 375]),
torch.Size([3, 427, 640]),
torch.Size([3, 480, 640]),
torch.Size([3, 338, 500]),
torch.Size([3, 427, 640]),
torch.Size([3, 375, 500]),
torch.Size([3, 640, 426])]
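Now every image has three channels, but the spatial sizes still differ. A small optional check that answers both questions programmatically:
shapes = [compose(img).shape for img in coco['train']['image']]
print(all(s[0] == 3 for s in shapes))        # True: all images are 3-channel after convert("RGB")
print(len(set(s[1:] for s in shapes)) == 1)  # False: the (H, W) sizes are not all the same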
(3) Use .with_transform() to apply the following transformation to the image column of the coco dataset:
1. Apply .convert("RGB") to each observation (= example) in the image column.
2. Convert the result of 1 to a tensor.
3. Resize the result of 2 to (64, 64).
4. Add the result to a pixel_values column.
The output should look like the following (the columns include image_id, coco_url, image, caption, recaption, and pixel_values, where pixel_values is a 4-dimensional tensor):
{'image_id': [53120],
'coco_url': ['http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg'],
'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x453>],
'caption': ['Several horses with riders crossing a body of water.'],
'recaption': ['A group of people is riding horses along a sandy beach near a body of water. The horses are walking in the shallow water, and the riders are wearing blue and red clothing. The beach is surrounded by trees, and there is a wooden post visible in the foreground.'],
'pixel_values': tensor([[[[0.1884, 0.2323, 0.2613, ..., 0.3255, 0.3241, 0.3079],
[0.1864, 0.2289, 0.2521, ..., 0.3119, 0.3239, 0.3061],
[0.1936, 0.2166, 0.2491, ..., 0.3155, 0.3226, 0.2973],
...,
[0.6029, 0.6010, 0.5950, ..., 0.7389, 0.7281, 0.7246],
[0.6365, 0.6395, 0.6513, ..., 0.7168, 0.7107, 0.6726],
[0.6503, 0.6531, 0.6510, ..., 0.7534, 0.7166, 0.6811]],
[[0.2061, 0.2484, 0.2660, ..., 0.3418, 0.3342, 0.3343],
[0.2077, 0.2467, 0.2561, ..., 0.3358, 0.3306, 0.3263],
[0.2126, 0.2379, 0.2553, ..., 0.3404, 0.3315, 0.3112],
...,
[0.6330, 0.6341, 0.6259, ..., 0.7595, 0.7462, 0.7359],
[0.6780, 0.6830, 0.6969, ..., 0.7413, 0.7327, 0.6938],
[0.6920, 0.7049, 0.6990, ..., 0.7847, 0.7436, 0.7001]],
[[0.2549, 0.2798, 0.3033, ..., 0.3978, 0.3955, 0.3942],
[0.2577, 0.2820, 0.2963, ..., 0.3957, 0.3913, 0.3909],
[0.2626, 0.2807, 0.2940, ..., 0.3909, 0.3880, 0.3827],
...,
[0.7402, 0.7438, 0.7295, ..., 0.8474, 0.8322, 0.8150],
[0.7877, 0.7875, 0.7959, ..., 0.7985, 0.7764, 0.7155],
[0.7794, 0.7862, 0.7845, ..., 0.8482, 0.7953, 0.7360]]]])}
Also write code that cancels this transformation.
(Solution)
compose = torchvision.transforms.Compose([
    lambda x: x.convert("RGB"),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Resize((64, 64))
])
def w_transform(examples):
    # stack the per-image (3, 64, 64) tensors into one (batch, 3, 64, 64) tensor
    examples['pixel_values'] = torch.stack(list(map(compose, examples['image'])))
    return examples
coco = coco.with_transform(w_transform)
coco['train'][:1]
{'image_id': [53120],
'coco_url': ['http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg'],
'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x453>],
'caption': ['Several horses with riders crossing a body of water.'],
'recaption': ['A group of people is riding horses along a sandy beach near a body of water. The horses are walking in the shallow water, and the riders are wearing blue and red clothing. The beach is surrounded by trees, and there is a wooden post visible in the foreground.'],
'pixel_values': tensor([[[[0.1913, 0.2348, 0.2648, ..., 0.3231, 0.3242, 0.3105],
[0.1917, 0.2333, 0.2552, ..., 0.3147, 0.3210, 0.3056],
[0.1973, 0.2218, 0.2492, ..., 0.3149, 0.3175, 0.2976],
...,
[0.6069, 0.6036, 0.5903, ..., 0.7369, 0.7262, 0.7286],
[0.6336, 0.6364, 0.6470, ..., 0.7202, 0.7085, 0.6731],
[0.6504, 0.6513, 0.6491, ..., 0.7540, 0.7168, 0.6772]],
[[0.2053, 0.2441, 0.2653, ..., 0.3427, 0.3320, 0.3313],
[0.2058, 0.2429, 0.2560, ..., 0.3344, 0.3290, 0.3261],
[0.2125, 0.2375, 0.2561, ..., 0.3373, 0.3318, 0.3115],
...,
[0.6305, 0.6317, 0.6258, ..., 0.7585, 0.7478, 0.7345],
[0.6790, 0.6840, 0.6975, ..., 0.7387, 0.7337, 0.6942],
[0.6894, 0.7040, 0.6990, ..., 0.7839, 0.7428, 0.7009]],
[[0.2564, 0.2870, 0.3125, ..., 0.3988, 0.3970, 0.3955],
[0.2568, 0.2858, 0.3021, ..., 0.3903, 0.3937, 0.3906],
[0.2608, 0.2816, 0.3016, ..., 0.3875, 0.3889, 0.3807],
...,
[0.7436, 0.7437, 0.7317, ..., 0.8488, 0.8293, 0.8157],
[0.7874, 0.7897, 0.7997, ..., 0.7990, 0.7746, 0.7159],
[0.7784, 0.7845, 0.7870, ..., 0.8497, 0.7943, 0.7377]]]])}
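The problem also asks for code that cancels the transform. Since with_transform only changes how rows are formatted at access time, resetting the format undoes it; a minimal sketch using the datasets reset_format() method:
coco.reset_format()  # in-place: every split returns plain Python objects again, without pixel_values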
(4) Write code that converts the image stored at the url below,
"http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg",
into the same tensor form as in (3).
(Incorrect solution) – This is how I solved it during the walkthrough, but it is wrong.
url = "http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg"
response = requests.get(url)
image = Image.open (BytesIO(response.content))
torchvision.transforms.ToTensor()(image)
tensor([[[0.1569, 0.1843, 0.1765, ..., 0.2745, 0.2784, 0.2824],
[0.1765, 0.1647, 0.1725, ..., 0.3098, 0.3137, 0.2941],
[0.1922, 0.1647, 0.1765, ..., 0.3098, 0.3216, 0.3098],
...,
[0.7020, 0.7098, 0.6980, ..., 0.8235, 0.8275, 0.8196],
[0.5020, 0.5216, 0.5059, ..., 0.7961, 0.7922, 0.7922],
[0.5451, 0.5294, 0.5373, ..., 0.7725, 0.7882, 0.7765]],
[[0.2039, 0.2196, 0.2039, ..., 0.2980, 0.3020, 0.3098],
[0.2078, 0.1882, 0.1961, ..., 0.3373, 0.3412, 0.3255],
[0.2039, 0.1765, 0.1882, ..., 0.3412, 0.3529, 0.3412],
...,
[0.7804, 0.8039, 0.7843, ..., 0.8549, 0.8510, 0.8431],
[0.5373, 0.5686, 0.5490, ..., 0.8314, 0.8275, 0.8275],
[0.5725, 0.5804, 0.5882, ..., 0.7961, 0.8235, 0.8235]],
[[0.2510, 0.2784, 0.2745, ..., 0.4000, 0.4039, 0.3843],
[0.2588, 0.2510, 0.2588, ..., 0.4078, 0.4118, 0.3765],
[0.2627, 0.2353, 0.2549, ..., 0.3843, 0.3961, 0.3843],
...,
[0.8157, 0.8431, 0.8392, ..., 0.9059, 0.9137, 0.9059],
[0.5961, 0.6314, 0.6196, ..., 0.8980, 0.8863, 0.8863],
[0.6353, 0.6431, 0.6549, ..., 0.8824, 0.8902, 0.8706]]])
I misread the problem when I explained this, and solving it a day after writing it, I confused myself. Since the result has to be the same tensor as in (3), we also need to apply Resize after tensorizing, as in the solution below. Sorry for the confusion.
(Correct solution)
url = "http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg"
response = requests.get(url)
image = Image.open (BytesIO(response.content))
image_tensor = torchvision.transforms.ToTensor()(image)
#---# 여기가 끝이 아니고 아래처럼 resize까지 해줘야함
image_tensor_resized = torchvision.transforms.Resize((64 ,64 ))(image_tensor)
image_tensor_resized
tensor([[[0.1884, 0.2323, 0.2613, ..., 0.3255, 0.3241, 0.3079],
[0.1864, 0.2289, 0.2521, ..., 0.3119, 0.3239, 0.3061],
[0.1936, 0.2166, 0.2491, ..., 0.3155, 0.3226, 0.2973],
...,
[0.6029, 0.6010, 0.5950, ..., 0.7389, 0.7281, 0.7246],
[0.6365, 0.6395, 0.6513, ..., 0.7168, 0.7107, 0.6726],
[0.6503, 0.6531, 0.6510, ..., 0.7534, 0.7166, 0.6811]],
[[0.2061, 0.2484, 0.2660, ..., 0.3418, 0.3342, 0.3343],
[0.2077, 0.2467, 0.2561, ..., 0.3358, 0.3306, 0.3263],
[0.2126, 0.2379, 0.2553, ..., 0.3404, 0.3315, 0.3112],
...,
[0.6330, 0.6341, 0.6259, ..., 0.7595, 0.7462, 0.7359],
[0.6780, 0.6830, 0.6969, ..., 0.7413, 0.7327, 0.6938],
[0.6920, 0.7049, 0.6990, ..., 0.7847, 0.7436, 0.7001]],
[[0.2549, 0.2798, 0.3033, ..., 0.3978, 0.3955, 0.3942],
[0.2577, 0.2820, 0.2963, ..., 0.3957, 0.3913, 0.3909],
[0.2626, 0.2807, 0.2940, ..., 0.3909, 0.3880, 0.3827],
...,
[0.7402, 0.7438, 0.7295, ..., 0.8474, 0.8322, 0.8150],
[0.7877, 0.7875, 0.7959, ..., 0.7985, 0.7764, 0.7155],
[0.7794, 0.7862, 0.7845, ..., 0.8482, 0.7953, 0.7360]]])
Or alternatively:
url = "http://images.cocodataset.org/val2014/COCO_val2014_000000053120.jpg"
response = requests.get(url)
image = Image.open(BytesIO(response.content))
compose(image)  # reuse the compose declared in (3); it also contains the RGB conversion,
                # but that step is an identity here since the image is already in color
tensor([[[0.1884, 0.2323, 0.2613, ..., 0.3255, 0.3241, 0.3079],
[0.1864, 0.2289, 0.2521, ..., 0.3119, 0.3239, 0.3061],
[0.1936, 0.2166, 0.2491, ..., 0.3155, 0.3226, 0.2973],
...,
[0.6029, 0.6010, 0.5950, ..., 0.7389, 0.7281, 0.7246],
[0.6365, 0.6395, 0.6513, ..., 0.7168, 0.7107, 0.6726],
[0.6503, 0.6531, 0.6510, ..., 0.7534, 0.7166, 0.6811]],
[[0.2061, 0.2484, 0.2660, ..., 0.3418, 0.3342, 0.3343],
[0.2077, 0.2467, 0.2561, ..., 0.3358, 0.3306, 0.3263],
[0.2126, 0.2379, 0.2553, ..., 0.3404, 0.3315, 0.3112],
...,
[0.6330, 0.6341, 0.6259, ..., 0.7595, 0.7462, 0.7359],
[0.6780, 0.6830, 0.6969, ..., 0.7413, 0.7327, 0.6938],
[0.6920, 0.7049, 0.6990, ..., 0.7847, 0.7436, 0.7001]],
[[0.2549, 0.2798, 0.3033, ..., 0.3978, 0.3955, 0.3942],
[0.2577, 0.2820, 0.2963, ..., 0.3957, 0.3913, 0.3909],
[0.2626, 0.2807, 0.2940, ..., 0.3909, 0.3880, 0.3827],
...,
[0.7402, 0.7438, 0.7295, ..., 0.8474, 0.8322, 0.8150],
[0.7877, 0.7875, 0.7959, ..., 0.7985, 0.7764, 0.7155],
[0.7794, 0.7862, 0.7845, ..., 0.8482, 0.7953, 0.7360]]])
(5) Use .with_transform() to apply the same transformation as in (4) to each observation (= example) in the coco_url column.
The output should look like the following (the only column included is pixel_values, which is a 4-dimensional tensor):
{'pixel_values': tensor([[[[0.1884, 0.2323, 0.2613, ..., 0.3255, 0.3241, 0.3079],
[0.1864, 0.2289, 0.2521, ..., 0.3119, 0.3239, 0.3061],
[0.1936, 0.2166, 0.2491, ..., 0.3155, 0.3226, 0.2973],
...,
[0.6029, 0.6010, 0.5950, ..., 0.7389, 0.7281, 0.7246],
[0.6365, 0.6395, 0.6513, ..., 0.7168, 0.7107, 0.6726],
[0.6503, 0.6531, 0.6510, ..., 0.7534, 0.7166, 0.6811]],
[[0.2061, 0.2484, 0.2660, ..., 0.3418, 0.3342, 0.3343],
[0.2077, 0.2467, 0.2561, ..., 0.3358, 0.3306, 0.3263],
[0.2126, 0.2379, 0.2553, ..., 0.3404, 0.3315, 0.3112],
...,
[0.6330, 0.6341, 0.6259, ..., 0.7595, 0.7462, 0.7359],
[0.6780, 0.6830, 0.6969, ..., 0.7413, 0.7327, 0.6938],
[0.6920, 0.7049, 0.6990, ..., 0.7847, 0.7436, 0.7001]],
[[0.2549, 0.2798, 0.3033, ..., 0.3978, 0.3955, 0.3942],
[0.2577, 0.2820, 0.2963, ..., 0.3957, 0.3913, 0.3909],
[0.2626, 0.2807, 0.2940, ..., 0.3909, 0.3880, 0.3827],
...,
[0.7402, 0.7438, 0.7295, ..., 0.8474, 0.8322, 0.8150],
[0.7877, 0.7875, 0.7959, ..., 0.7985, 0.7764, 0.7155],
[0.7794, 0.7862, 0.7845, ..., 0.8482, 0.7953, 0.7360]]]])}
Also write code that cancels this transformation.
(Solution)
compose = torchvision.transforms.Compose([
    lambda x: requests.get(x),                 # download the image bytes from the url
    lambda y: Image.open(BytesIO(y.content)),  # decode the bytes into a PIL image
    lambda z: z.convert("RGB"),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Resize((64, 64))
])
def w_transform(examples):
    # build a fresh dict, so pixel_values becomes the only column in the output
    dct = dict()
    dct['pixel_values'] = torch.stack(list(map(compose, examples['coco_url'])))
    return dct
coco = coco.with_transform(w_transform)
coco['train'][:1]
{'pixel_values': tensor([[[[0.1884, 0.2323, 0.2613, ..., 0.3255, 0.3241, 0.3079],
[0.1864, 0.2289, 0.2521, ..., 0.3119, 0.3239, 0.3061],
[0.1936, 0.2166, 0.2491, ..., 0.3155, 0.3226, 0.2973],
...,
[0.6029, 0.6010, 0.5950, ..., 0.7389, 0.7281, 0.7246],
[0.6365, 0.6395, 0.6513, ..., 0.7168, 0.7107, 0.6726],
[0.6503, 0.6531, 0.6510, ..., 0.7534, 0.7166, 0.6811]],
[[0.2061, 0.2484, 0.2660, ..., 0.3418, 0.3342, 0.3343],
[0.2077, 0.2467, 0.2561, ..., 0.3358, 0.3306, 0.3263],
[0.2126, 0.2379, 0.2553, ..., 0.3404, 0.3315, 0.3112],
...,
[0.6330, 0.6341, 0.6259, ..., 0.7595, 0.7462, 0.7359],
[0.6780, 0.6830, 0.6969, ..., 0.7413, 0.7327, 0.6938],
[0.6920, 0.7049, 0.6990, ..., 0.7847, 0.7436, 0.7001]],
[[0.2549, 0.2798, 0.3033, ..., 0.3978, 0.3955, 0.3942],
[0.2577, 0.2820, 0.2963, ..., 0.3957, 0.3913, 0.3909],
[0.2626, 0.2807, 0.2940, ..., 0.3909, 0.3880, 0.3827],
...,
[0.7402, 0.7438, 0.7295, ..., 0.8474, 0.8322, 0.8150],
[0.7877, 0.7875, 0.7959, ..., 0.7985, 0.7764, 0.7155],
[0.7794, 0.7862, 0.7845, ..., 0.8482, 0.7953, 0.7360]]]])}
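The cancel code is the same as in (3):
coco.reset_format()  # in-place: drops the custom transform and restores the original columns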