Python’s token module defines integer constants for the token types produced by the tokenize module. import token. Key constants: token.NAME (name/keyword), token.NUMBER (numeric literal), token.STRING (string/byte literal), token.OP (operator or delimiter), token.COMMENT (comment), token.NEWLINE (logical newline), token.NL (non-logical newline), token.INDENT, token.DEDENT, token.ENDMARKER, token.ERRORTOKEN. Name map: token.tok_name → dict[int, str] (e.g. {1: "NAME", 2: "NUMBER", ...}). Predicates: token.ISTERMINAL(x) → True if x is a terminal token type (leaf node); token.ISNONTERMINAL(x) → True for non-terminal (grammar-rule) values; token.ISEOF(x) → True for ENDMARKER. Exact types: token.EXACT_TOKEN_TYPES (public in the token module since Python 3.8) — maps an operator/delimiter string, including multi-character ones, to its precise type constant (e.g. "+" → token.PLUS). Each tokenize.TokenInfo also has an exact_type attribute that resolves OP tokens to these; e.g. LPAR, RPAR, COMMA, DOT, COLON, SEMI, PLUS, MINUS, STAR, SLASH, PERCENT, EQUAL, etc. The token module is a companion to tokenize — use tokenize to produce the stream and token constants to classify it. Claude Code generates source analyzers, identifier extractors, string literal finders, and token stream processors.
CLAUDE.md for token
## token Stack
- Stdlib: import token, tokenize, io
- Names: token.tok_name[tok_type] # "NAME", "NUMBER", "STRING" ...
- Check: tok.type == token.NAME # is it an identifier/keyword?
- Exact: tok.exact_type == token.LPAR # single-char exact type
- All: token.tok_name.items() # iterate all type codes
- Note: token gives constants; tokenize produces the actual stream
token Token Type Pipeline
# app/tokenutil.py — classify, extract, count, filter, stream processor
from __future__ import annotations
import io
import keyword
import token
import tokenize
from dataclasses import dataclass, field
from typing import Callable, Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. Token stream helpers
# ─────────────────────────────────────────────────────────────────────────────
def tokenize_source(source: str) -> list[tokenize.TokenInfo]:
    """
    Run the tokenizer over a Python source string and collect every token.

    Example:
        tokens = tokenize_source("x = 1 + 2")
        for tok in tokens:
            print(tok)
    """
    readline = io.StringIO(source).readline
    return [tok for tok in tokenize.generate_tokens(readline)]
def token_type_name(tok_type: int) -> str:
    """
    Return the human-readable name for a token type integer.

    Unknown codes yield "UNKNOWN(<code>)" rather than raising.

    Example:
        print(token_type_name(token.NAME))    # "NAME"
        print(token_type_name(token.NUMBER))  # "NUMBER"
    """
    try:
        return token.tok_name[tok_type]
    except KeyError:
        return f"UNKNOWN({tok_type})"
def describe_token(tok: tokenize.TokenInfo) -> str:
    """
    Build a one-line human-readable description of a TokenInfo.

    Example:
        for tok in tokenize_source("x = 1"):
            print(describe_token(tok))
    """
    name = token.tok_name.get(tok.type, f"UNKNOWN({tok.type})")
    row, col = tok.start
    return f"{name:12s} {tok.string!r:20s} line {row}:{col}"
# ─────────────────────────────────────────────────────────────────────────────
# 2. Token classifiers
# ─────────────────────────────────────────────────────────────────────────────
def is_name_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a NAME token (identifier or keyword)."""
    return token.NAME == tok.type
def is_keyword_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a NAME whose text is a reserved Python keyword."""
    if tok.type != token.NAME:
        return False
    return keyword.iskeyword(tok.string)
def is_identifier_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a NAME that is not a reserved keyword."""
    if tok.type != token.NAME:
        return False
    return not keyword.iskeyword(tok.string)
def is_string_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a string or bytes literal."""
    return token.STRING == tok.type
def is_number_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a numeric literal."""
    return token.NUMBER == tok.type
def is_comment_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is a comment (``#...``)."""
    return token.COMMENT == tok.type
def is_operator_token(tok: tokenize.TokenInfo) -> bool:
    """True when *tok* is an operator or delimiter."""
    return token.OP == tok.type
# ─────────────────────────────────────────────────────────────────────────────
# 3. Extraction helpers
# ─────────────────────────────────────────────────────────────────────────────
def extract_identifiers(source: str) -> list[str]:
    """
    Return all non-keyword NAME tokens (identifiers) in source order.

    Example:
        ids = extract_identifiers("def greet(name): return f'hi {name}'")
        print(ids)  # ['greet', 'name', 'name']
    """
    found: list[str] = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == token.NAME and not keyword.iskeyword(tok.string):
            found.append(tok.string)
    return found
def extract_string_literals(source: str) -> list[str]:
    """
    Return the raw text of each STRING token in *source*, in order.

    Example:
        strs = extract_string_literals('x = "hello"\ny = b"bytes"')
        print(strs)  # ['"hello"', 'b"bytes"']
    """
    stream = tokenize.generate_tokens(io.StringIO(source).readline)
    return [tok.string for tok in stream if tok.type == token.STRING]
def extract_comments(source: str) -> list[str]:
    """
    Return the text of every comment token (including the ``#`` prefix).

    Example:
        comments = extract_comments("x = 1 # a comment\n# another")
        print(comments)
    """
    stream = tokenize.generate_tokens(io.StringIO(source).readline)
    return [tok.string for tok in stream if tok.type == token.COMMENT]
def extract_numbers(source: str) -> list[str]:
    """
    Return the raw text of every numeric literal in *source*, in order.

    Example:
        nums = extract_numbers("x = 42\ny = 3.14\nz = 0xFF")
        print(nums)  # ['42', '3.14', '0xFF']
    """
    stream = tokenize.generate_tokens(io.StringIO(source).readline)
    return [tok.string for tok in stream if tok.type == token.NUMBER]
def extract_keywords_used(source: str) -> list[str]:
    """
    Return every Python keyword used in *source* (in order, duplicates kept).

    Example:
        kws = extract_keywords_used("def f(): return None if True else False")
        print(kws)
    """
    used: list[str] = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == token.NAME and keyword.iskeyword(tok.string):
            used.append(tok.string)
    return used
# ─────────────────────────────────────────────────────────────────────────────
# 4. Token statistics
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class TokenStats:
    """
    Frequency statistics for a Python source string.

    Example:
        stats = token_stats("def add(a, b): return a + b")
        print(stats)
    """
    # number of counted tokens
    total: int
    # token-type name -> occurrence count
    by_type: dict[str, int] = field(default_factory=dict)
    # identifier text -> occurrence count
    identifiers: dict[str, int] = field(default_factory=dict)
    # keyword text -> occurrence count
    keywords: dict[str, int] = field(default_factory=dict)

    def __str__(self) -> str:
        # Header first, then type counts ranked most-frequent first
        # (stable sort keeps insertion order among equal counts).
        ranked = sorted(self.by_type.items(), key=lambda item: item[1], reverse=True)
        body = [f" {name:15s}: {count}" for name, count in ranked]
        return "\n".join([f"TokenStats(total={self.total})", *body])
def token_stats(source: str) -> TokenStats:
    """
    Compute token frequency statistics for a source string.

    ENCODING and ENDMARKER tokens are excluded from every count.

    Example:
        stats = token_stats(open("mymodule.py").read())
        print(stats)
        print("Most common identifier:", max(stats.identifiers, key=stats.identifiers.get))
    """
    skipped = (token.ENCODING, token.ENDMARKER)
    by_type: dict[str, int] = {}
    identifiers: dict[str, int] = {}
    keywords_used: dict[str, int] = {}
    total = 0
    for tok in tokenize_source(source):
        if tok.type in skipped:
            continue
        total += 1
        name = token_type_name(tok.type)
        by_type[name] = 1 + by_type.get(name, 0)
        # A NAME token is either a keyword or an identifier, never both.
        if is_keyword_token(tok):
            keywords_used[tok.string] = 1 + keywords_used.get(tok.string, 0)
        elif is_identifier_token(tok):
            identifiers[tok.string] = 1 + identifiers.get(tok.string, 0)
    return TokenStats(
        total=total,
        by_type=by_type,
        identifiers=identifiers,
        keywords=keywords_used,
    )
# ─────────────────────────────────────────────────────────────────────────────
# 5. Token stream filter / transformer
# ─────────────────────────────────────────────────────────────────────────────
def filter_tokens(
    source: str,
    predicate: Callable[[tokenize.TokenInfo], bool],
) -> list[tokenize.TokenInfo]:
    """
    Return only the tokens of *source* for which *predicate* is True.

    Example:
        ops = filter_tokens(source, is_operator_token)
        all_names = filter_tokens(source, is_name_token)
    """
    stream = tokenize.generate_tokens(io.StringIO(source).readline)
    return [tok for tok in stream if predicate(tok)]
def strip_comments(source: str) -> str:
    """
    Return *source* with all comment tokens removed.

    Uses a token round-trip via tokenize.untokenize, so non-comment
    spacing is reconstructed from the original token positions.

    Example:
        clean = strip_comments("x = 1 # remove me\n# also me\ny = 2\n")
        print(clean)
    """
    kept = [
        tok
        for tok in tokenize.generate_tokens(io.StringIO(source).readline)
        if tok.type != token.COMMENT
    ]
    return tokenize.untokenize(kept)
def rename_identifier(source: str, old_name: str, new_name: str) -> str:
    """
    Replace every occurrence of identifier *old_name* with *new_name*.

    Only non-keyword NAME tokens that exactly match *old_name* are
    rewritten; strings, comments, and keywords are left untouched.

    Example:
        new_src = rename_identifier("x = old + old\n", "old", "renamed")
        print(new_src)  # x = renamed + renamed
    """
    rewritten: list[tokenize.TokenInfo] = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        matches = (
            tok.type == token.NAME
            and not keyword.iskeyword(tok.string)
            and tok.string == old_name
        )
        rewritten.append(tok._replace(string=new_name) if matches else tok)
    return tokenize.untokenize(rewritten)
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("=== token demo ===")

    # ── tok_name dict ─────────────────────────────────────────────────────────
    print("\n--- tok_name ---")
    demo_codes = [
        token.NAME, token.NUMBER, token.STRING, token.OP,
        token.COMMENT, token.NEWLINE, token.INDENT, token.DEDENT,
        token.ENDMARKER,
    ]
    for type_code in demo_codes:
        print(f" {type_code:3d} {token.tok_name[type_code]}")

    sample = (
        "def greet(name: str) -> str:\n"
        " # Return a greeting\n"
        " count = 42\n"
        " return f'Hello, {name}!'\n"
    )

    # ── tokenize_source ────────────────────────────────────────────────────────
    print("\n--- tokenize_source (first 10 tokens) ---")
    for tk in tokenize_source(sample)[:10]:
        print(f" {describe_token(tk)}")

    # ── extract helpers ────────────────────────────────────────────────────────
    print("\n--- extract_identifiers ---")
    print(f" {extract_identifiers(sample)}")
    print("\n--- extract_keywords_used ---")
    print(f" {extract_keywords_used(sample)}")
    print("\n--- extract_comments ---")
    print(f" {extract_comments(sample)}")
    print("\n--- extract_numbers ---")
    print(f" {extract_numbers(sample)}")

    # ── token_stats ────────────────────────────────────────────────────────────
    print("\n--- token_stats ---")
    print(token_stats(sample))

    # ── strip_comments ────────────────────────────────────────────────────────
    print("\n--- strip_comments ---")
    for text in strip_comments(sample).splitlines():
        print(f" {text!r}")

    # ── rename_identifier ─────────────────────────────────────────────────────
    print("\n--- rename_identifier ---")
    refreshed = rename_identifier("x = old + old * 2\n", "old", "fresh")
    print(f" {refreshed!r}")

    print("\n=== done ===")
For the tokenize alternative — tokenize.generate_tokens(readline) produces the full TokenInfo stream including exact positions and continuation lines — use tokenize to produce the stream; use token constants to classify each token type in the stream. They are complementary, not alternatives — token is the constant catalogue; tokenize is the stream producer. For the ast.parse alternative — ast.parse(source) builds a full AST with typed nodes (ast.Name, ast.Constant, ast.FunctionDef, etc.) — use ast when you need to understand the structure of the code (call sites, class hierarchy, expression trees); use tokenize/token when you need purely lexical analysis (comments, whitespace preservation, exact token positions) without caring about parse tree structure. The Claude Skills 360 bundle includes token skill sets covering tokenize_source()/token_type_name()/describe_token() stream helpers, is_name_token()/is_keyword_token()/is_identifier_token()/is_string_token()/is_number_token()/is_comment_token()/is_operator_token() classifiers, extract_identifiers()/extract_string_literals()/extract_comments()/extract_numbers()/extract_keywords_used() extractors, TokenStats with by_type/identifiers/keywords dicts, filter_tokens()/strip_comments()/rename_identifier() transformers. Start with the free tier to try token stream patterns and token pipeline code generation.