added grid_size parameter to generate a grid of images

This commit is contained in:
Brett Kuprel
2022-07-02 08:45:49 -04:00
parent e0386f991c
commit 1eb56737d8
6 changed files with 87 additions and 69 deletions
+4 -6
View File
@@ -1,15 +1,13 @@
from math import inf
from typing import List, Tuple
class TextTokenizer:
def __init__(self, vocab: dict, merges: List[str], is_verbose: bool = True):
def __init__(self, vocab: dict, merges: list[str], is_verbose: bool = True):
self.is_verbose = is_verbose
self.token_from_subword = vocab
pairs = [tuple(pair.split()) for pair in merges]
self.rank_from_pair = dict(zip(pairs, range(len(pairs))))
def tokenize(self, text: str) -> List[int]:
def tokenize(self, text: str) -> list[int]:
sep_token = self.token_from_subword['</s>']
cls_token = self.token_from_subword['<s>']
unk_token = self.token_from_subword['<unk>']
@@ -21,8 +19,8 @@ class TextTokenizer:
]
return [cls_token] + tokens + [sep_token]
def get_byte_pair_encoding(self, word: str) -> List[str]:
def get_pair_rank(pair: Tuple[str, str]) -> int:
def get_byte_pair_encoding(self, word: str) -> list[str]:
def get_pair_rank(pair: tuple[str, str]) -> int:
return self.rank_from_pair.get(pair, inf)
subwords = [chr(ord(" ") + 256)] + list(word)