 
            
            
            
            
                2 Apr
                
                    2022
                
            
            
                2 Apr
                
                '22
                
            
            
            
        
    
                1:13 p.m.
            
        # Karl wants to learn around this first block but [doesn't understand
        # that his working memory doesn't have space for it under these
        # conditions right now], so it is commented out.
        # from tokenizers.pre_tokenizers import Whitespace
        # tokenizer.pre_tokenizer = Whitespace()
        #
        # This is the training bit for using the example code:
        # files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
        # tokenizer.train(files, trainer)