{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" ] } ], "source": [ "from transformers import AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encode and Decode" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1350, 492, 151643, 863, 151643]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# treat surface forms of special tokens as actual special tokens\n", "# the default, but unsafe (to be compatible with other projects)\n", "# the same as tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\", allowed_special='all', disallowed_special=())\n", "tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"print('<|endoftext|>')<|endoftext|>\"" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([1350, 492, 151643, 863, 151643])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# treat texts just as texts, avoid injection attacks\n", "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "\"print('<|endoftext|>')<|endoftext|>\"" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[7], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m tokenizer\u001b[39m.\u001b[39;49mencode(\u001b[39m\"\u001b[39;49m\u001b[39mprint(\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m<|endoftext|>\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m)\u001b[39;49m\u001b[39m\"\u001b[39;49m, allowed_special\u001b[39m=\u001b[39;49m\u001b[39mset\u001b[39;49m(), disallowed_special\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mall\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39m+\u001b[39m [tokenizer\u001b[39m.\u001b[39meod_id]\n", "File \u001b[1;32mtransformers\\tokenization_utils_base.py:2348\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode\u001b[1;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)\u001b[0m\n\u001b[0;32m 
2311\u001b[0m \u001b[39m@add_end_docstrings\u001b[39m(\n\u001b[0;32m 2312\u001b[0m ENCODE_KWARGS_DOCSTRING,\n\u001b[0;32m 2313\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2331\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2332\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[\u001b[39mint\u001b[39m]:\n\u001b[0;32m 2333\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 2334\u001b[0m \u001b[39m Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\u001b[39;00m\n\u001b[0;32m 2335\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2346\u001b[0m \u001b[39m method).\u001b[39;00m\n\u001b[0;32m 2347\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2348\u001b[0m encoded_inputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencode_plus(\n\u001b[0;32m 2349\u001b[0m text,\n\u001b[0;32m 2350\u001b[0m text_pair\u001b[39m=\u001b[39mtext_pair,\n\u001b[0;32m 2351\u001b[0m add_special_tokens\u001b[39m=\u001b[39madd_special_tokens,\n\u001b[0;32m 2352\u001b[0m padding\u001b[39m=\u001b[39mpadding,\n\u001b[0;32m 2353\u001b[0m truncation\u001b[39m=\u001b[39mtruncation,\n\u001b[0;32m 2354\u001b[0m max_length\u001b[39m=\u001b[39mmax_length,\n\u001b[0;32m 2355\u001b[0m stride\u001b[39m=\u001b[39mstride,\n\u001b[0;32m 2356\u001b[0m return_tensors\u001b[39m=\u001b[39mreturn_tensors,\n\u001b[0;32m 2357\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2358\u001b[0m )\n\u001b[0;32m 2360\u001b[0m \u001b[39mreturn\u001b[39;00m encoded_inputs[\u001b[39m\"\u001b[39m\u001b[39minput_ids\u001b[39m\u001b[39m\"\u001b[39m]\n", "File \u001b[1;32mtransformers\\tokenization_utils_base.py:2756\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[1;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, 
return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[0;32m 2746\u001b[0m \u001b[39m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[0;32m 2747\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[0;32m 2748\u001b[0m padding\u001b[39m=\u001b[39mpadding,\n\u001b[0;32m 2749\u001b[0m truncation\u001b[39m=\u001b[39mtruncation,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2753\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2754\u001b[0m )\n\u001b[1;32m-> 2756\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_encode_plus(\n\u001b[0;32m 2757\u001b[0m text\u001b[39m=\u001b[39mtext,\n\u001b[0;32m 2758\u001b[0m text_pair\u001b[39m=\u001b[39mtext_pair,\n\u001b[0;32m 2759\u001b[0m add_special_tokens\u001b[39m=\u001b[39madd_special_tokens,\n\u001b[0;32m 2760\u001b[0m padding_strategy\u001b[39m=\u001b[39mpadding_strategy,\n\u001b[0;32m 2761\u001b[0m truncation_strategy\u001b[39m=\u001b[39mtruncation_strategy,\n\u001b[0;32m 2762\u001b[0m max_length\u001b[39m=\u001b[39mmax_length,\n\u001b[0;32m 2763\u001b[0m stride\u001b[39m=\u001b[39mstride,\n\u001b[0;32m 2764\u001b[0m is_split_into_words\u001b[39m=\u001b[39mis_split_into_words,\n\u001b[0;32m 2765\u001b[0m pad_to_multiple_of\u001b[39m=\u001b[39mpad_to_multiple_of,\n\u001b[0;32m 2766\u001b[0m return_tensors\u001b[39m=\u001b[39mreturn_tensors,\n\u001b[0;32m 2767\u001b[0m return_token_type_ids\u001b[39m=\u001b[39mreturn_token_type_ids,\n\u001b[0;32m 2768\u001b[0m return_attention_mask\u001b[39m=\u001b[39mreturn_attention_mask,\n\u001b[0;32m 2769\u001b[0m return_overflowing_tokens\u001b[39m=\u001b[39mreturn_overflowing_tokens,\n\u001b[0;32m 2770\u001b[0m 
return_special_tokens_mask\u001b[39m=\u001b[39mreturn_special_tokens_mask,\n\u001b[0;32m 2771\u001b[0m return_offsets_mapping\u001b[39m=\u001b[39mreturn_offsets_mapping,\n\u001b[0;32m 2772\u001b[0m return_length\u001b[39m=\u001b[39mreturn_length,\n\u001b[0;32m 2773\u001b[0m verbose\u001b[39m=\u001b[39mverbose,\n\u001b[0;32m 2774\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2775\u001b[0m )\n", "File \u001b[1;32mtransformers\\tokenization_utils.py:649\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus\u001b[1;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[0;32m 640\u001b[0m \u001b[39mif\u001b[39;00m return_offsets_mapping:\n\u001b[0;32m 641\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\n\u001b[0;32m 642\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mreturn_offset_mapping is not available when using Python tokenizers. 
\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 643\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mTo use this feature, change your tokenizer to one deriving from \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 646\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mhttps://github.com/huggingface/transformers/pull/2674\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 647\u001b[0m )\n\u001b[1;32m--> 649\u001b[0m first_ids \u001b[39m=\u001b[39m get_input_ids(text)\n\u001b[0;32m 650\u001b[0m second_ids \u001b[39m=\u001b[39m get_input_ids(text_pair) \u001b[39mif\u001b[39;00m text_pair \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 652\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_for_model(\n\u001b[0;32m 653\u001b[0m first_ids,\n\u001b[0;32m 654\u001b[0m pair_ids\u001b[39m=\u001b[39msecond_ids,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 668\u001b[0m verbose\u001b[39m=\u001b[39mverbose,\n\u001b[0;32m 669\u001b[0m )\n", "File \u001b[1;32mtransformers\\tokenization_utils.py:616\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus..get_input_ids\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m 614\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_input_ids\u001b[39m(text):\n\u001b[0;32m 615\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(text, \u001b[39mstr\u001b[39m):\n\u001b[1;32m--> 616\u001b[0m tokens \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenize(text, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 617\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconvert_tokens_to_ids(tokens)\n\u001b[0;32m 618\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(text, (\u001b[39mlist\u001b[39m, \u001b[39mtuple\u001b[39m)) \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(text) \u001b[39m>\u001b[39m 
\u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(text[\u001b[39m0\u001b[39m], \u001b[39mstr\u001b[39m):\n", "File \u001b[1;32mtokenization_qwen.py:155\u001b[0m, in \u001b[0;36mQWenTokenizer.tokenize\u001b[1;34m(self, text, allowed_special, disallowed_special, **kwargs)\u001b[0m\n\u001b[0;32m 152\u001b[0m text \u001b[39m=\u001b[39m unicodedata\u001b[39m.\u001b[39mnormalize(\u001b[39m\"\u001b[39m\u001b[39mNFC\u001b[39m\u001b[39m\"\u001b[39m, text)\n\u001b[0;32m 154\u001b[0m \u001b[39m# this implementation takes a detour: text -> token id -> token surface forms\u001b[39;00m\n\u001b[1;32m--> 155\u001b[0m \u001b[39mfor\u001b[39;00m t \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtokenizer\u001b[39m.\u001b[39;49mencode(\n\u001b[0;32m 156\u001b[0m text, allowed_special\u001b[39m=\u001b[39;49mallowed_special, disallowed_special\u001b[39m=\u001b[39;49mdisallowed_special\n\u001b[0;32m 157\u001b[0m ):\n\u001b[0;32m 158\u001b[0m tokens\u001b[39m.\u001b[39mappend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecoder[t])\n\u001b[0;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m tokens\n", "File \u001b[1;32mtiktoken\\core.py:117\u001b[0m, in \u001b[0;36mEncoding.encode\u001b[1;34m(self, text, allowed_special, disallowed_special)\u001b[0m\n\u001b[0;32m 115\u001b[0m disallowed_special \u001b[39m=\u001b[39m \u001b[39mfrozenset\u001b[39m(disallowed_special)\n\u001b[0;32m 116\u001b[0m \u001b[39mif\u001b[39;00m match \u001b[39m:=\u001b[39m _special_token_regex(disallowed_special)\u001b[39m.\u001b[39msearch(text):\n\u001b[1;32m--> 117\u001b[0m raise_disallowed_special_token(match\u001b[39m.\u001b[39;49mgroup())\n\u001b[0;32m 119\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 120\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_core_bpe\u001b[39m.\u001b[39mencode(text, allowed_special)\n", "File \u001b[1;32mtiktoken\\core.py:337\u001b[0m, in 
\u001b[0;36mraise_disallowed_special_token\u001b[1;34m(token)\u001b[0m\n\u001b[0;32m 336\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mraise_disallowed_special_token\u001b[39m(token: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m NoReturn:\n\u001b[1;32m--> 337\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 338\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mEncountered text corresponding to disallowed special token \u001b[39m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 339\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mIf you want this text to be encoded as a special token, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 340\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mpass it to `allowed_special`, e.g. `allowed_special=\u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m, ...\u001b[39m\u001b[39m}}\u001b[39;00m\u001b[39m`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 341\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIf you want this text to be encoded as normal text, disable the check for this token \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 342\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mby passing `disallowed_special=(enc.special_tokens_set - \u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m)`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 343\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mTo disable this check for all special tokens, pass `disallowed_special=()`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 344\u001b[0m )\n", "\u001b[1;31mValueError\u001b[0m: Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this 
text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n" ] } ], "source": [ "# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered\n", "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[151644, 1350, 11146, 91, 15460, 62, 15, 91, 79865, 151645, 151643]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fine-grained control, just keep this in mind:\n", "# tokens in allowed_special are treated as special tokens\n", "# tokens in disallowed_special raise errors\n", "# allowed_special has higher priority than disallowed_special\n", "tokenizer.encode(\"<|im_start|>print('<|extra_0|>')<|im_end|>\", \n", " allowed_special={'<|im_start|>', '<|im_end|>'}, \n", " disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[151644, 1350, 492, 151646, 863, 151645, 151643]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"<|im_start|>print('<|extra_0|>')<|im_end|>\", \n", " allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n", " disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Special Token Management" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using unk_token, 
but it is not set yet.\n" ] } ], "source": [ "# huggingface tokenizer has its own special token mechanism, so does tiktoken\n", "# we only use the tiktoken mechanism for special tokens, which means many properties of the huggingface tokenizer will be None\n", "tokenizer.unk_token" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "tokenizer.eos_token_id # use tokenizer.eod_id instead" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "tokenizer.pad_token_id " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151646" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use one of the extras such as <|extra_0|>\n", "tokenizer.special_tokens['<|extra_0|>']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Utility Methods" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[b'print', b\"('<\", b'|', b'endo', b'ft', b'ext', b'|', b\">')\", '<|endoftext|>']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# special tokens are str, tokens are bytes (since tiktoken operates at the byte level)\n", "ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]\n", "tokenizer.convert_ids_to_tokens(ids)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"print('<|endoftext|>')<|endoftext|>\"" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "ids = tokenizer.encode(\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵<|im_end|>\", \n", " allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n", " 
disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['<|im_start|>',\n", " b'print',\n", " b\"('\",\n", " b'\\xe6\\x88\\x91',\n", " b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80',\n", " b'\\xe5\\x8f\\xaa',\n", " b'\\xe7\\x8c\\xab',\n", " '<|extra_0|>',\n", " b\"')\\n\",\n", " b'#',\n", " b'\\xe5\\x96\\xb5',\n", " b'\\xe5\\x96\\xb5',\n", " b'\\xe5\\x96\\xb5',\n", " '<|im_end|>',\n", " '<|endoftext|>']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.convert_ids_to_tokens(ids)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵<|im_end|><|endoftext|>\"" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'<|extra_204|>'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer._convert_id_to_token(len(tokenizer)-1)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151850" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer._convert_token_to_id('<|extra_204|>')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Vocabulary Expansion" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [35946, 99639, 91680, 100472], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer(\"我是一只猫\")" ] }, { "cell_type": "code", "execution_count": 20, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "[99639, 91680, 100472]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"是一只猫\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, extra_vocab_file=\"qwen_extra.tiktoken\")\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151857" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokenizer)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [151854], 'token_type_ids': [0], 'attention_mask': [1]}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer(\"我是一只猫\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'我是一只猫'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(tokenizer.encode(\"我是一只猫\"))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[151853]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"是一只猫\")" ] } ], "metadata": { "kernelspec": { "display_name": "python3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }