#!/usr/bin/env python3
"""
scripts-boolean-parser
title: Boolean Search Parser for /cxq
component_type: script
version: 1.0.0
status: active
summary: Boolean operator parser for FTS5 search queries (ADR-149, J.4.4.1-J.4.4.3)
keywords: [cxq, query, boolean, search, parser, fts5, AND, OR, NOT]
track: J
task_id: J.4.4.1
created: 2026-02-04

Boolean Search Parser - J.4.4.1 Implementation

Parses user-friendly Boolean search expressions and converts them to
FTS5-compatible syntax.

Supports:
- Boolean operators: AND, OR, NOT (case-insensitive)
- Phrase search: "exact phrase" (J.4.4.2)
- Wildcards: prefix*, *suffix, contains (J.4.4.3)
- Parentheses for grouping: (term1 OR term2) AND term3

FTS5 Syntax Mapping:
- User: term1 AND term2      -> FTS5: term1 term2
- User: term1 OR term2       -> FTS5: term1 OR term2
- User: NOT term             -> FTS5: -term
- User: term1 AND NOT term2  -> FTS5: term1 -term2
- User: "exact phrase"       -> FTS5: "exact phrase"
- User: prefix*              -> FTS5: prefix*

Examples:
    from scripts.context_graph.boolean_parser import BooleanSearchParser

    parser = BooleanSearchParser()

    # Basic Boolean
    result = parser.parse("error AND authentication")
    # result.fts_query = "error authentication"

    # OR and NOT
    result = parser.parse("deploy OR kubernetes NOT test")
    # result.fts_query = "deploy OR kubernetes -test"

    # Phrases
    result = parser.parse('"database connection" AND timeout')
    # result.fts_query = '"database connection" timeout'

    # Complex
    result = parser.parse('(error OR warning) AND "stack trace" NOT debug')
    # result.fts_query = '(error OR warning) "stack trace" -debug'

Usage in /cxq:
    /cxq "error AND authentication"
    /cxq "deploy OR kubernetes"
    /cxq '"exact phrase" AND term'
    /cxq "prefix* AND *suffix"
"""
import logging
import re
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Set
# Module-level logger keyed on the module's dotted path (standard logging convention).
# Bug fix: original used the undefined name `name` instead of the dunder `__name__`.
logger = logging.getLogger(__name__)
# =============================================================================
# Token Types
# =============================================================================
class TokenType(Enum):
    """Token types for Boolean search parsing."""

    TERM = auto()      # Regular search term
    PHRASE = auto()    # Quoted phrase "like this"
    AND = auto()       # AND operator
    OR = auto()        # OR operator
    NOT = auto()       # NOT operator
    LPAREN = auto()    # (
    RPAREN = auto()    # )
    WILDCARD = auto()  # term* or *term or te*rm
    EOF = auto()       # End of input
@dataclass
class Token:
    """A single token from the search query."""

    type: TokenType    # Kind of token (see TokenType)
    value: str         # Raw text of the token
    position: int = 0  # Character offset in the original query
# =============================================================================
# Parse Result
# =============================================================================
@dataclass
class BooleanParseResult:
    """Result of parsing a Boolean search expression."""

    fts_query: str               # FTS5-compatible query string (MATCH-ready)
    original: str                # The user's original query text
    success: bool                # True when parsing succeeded
    error: Optional[str] = None  # Error message when success is False
    terms: List[str] = field(default_factory=list)           # Plain search terms seen
    phrases: List[str] = field(default_factory=list)         # Quoted phrases seen
    negated_terms: List[str] = field(default_factory=list)   # NOT-ed terms/phrases
    has_wildcards: bool = False  # True if any term contained '*'

    def __str__(self) -> str:
        """Human-readable summary: the FTS5 query on success, else the error."""
        if self.success:
            return f"FTS5: {self.fts_query}"
        return f"Error: {self.error}"
# =============================================================================
# Lexer (Tokenizer)
# =============================================================================
class BooleanLexer:
    """Tokenize Boolean search expressions into a flat token stream."""

    # Keywords (case-insensitive); any other word is a plain TERM/WILDCARD.
    KEYWORDS = {
        'AND': TokenType.AND,
        'OR': TokenType.OR,
        'NOT': TokenType.NOT,
    }

    def __init__(self, text: str):
        self.text = text         # Raw query string
        self.pos = 0             # Current scan position
        self.length = len(text)

    def tokenize(self) -> List[Token]:
        """Tokenize the entire input; the stream always ends with an EOF token."""
        tokens = []
        while self.pos < self.length:
            token = self._next_token()
            if token:
                tokens.append(token)
        tokens.append(Token(TokenType.EOF, '', self.pos))
        return tokens

    def _next_token(self) -> Optional[Token]:
        """Get the next token, or None when only whitespace remains."""
        self._skip_whitespace()
        if self.pos >= self.length:
            return None
        char = self.text[self.pos]
        start_pos = self.pos
        # Parentheses
        if char == '(':
            self.pos += 1
            return Token(TokenType.LPAREN, '(', start_pos)
        if char == ')':
            self.pos += 1
            return Token(TokenType.RPAREN, ')', start_pos)
        # Quoted phrase (single or double quotes)
        if char in '"\'':
            return self._read_phrase(char)
        # Word (term, keyword, or wildcard)
        return self._read_word()

    def _skip_whitespace(self):
        """Skip whitespace characters."""
        while self.pos < self.length and self.text[self.pos].isspace():
            self.pos += 1

    def _read_phrase(self, quote_char: str) -> Token:
        """Read a quoted phrase; an unclosed quote consumes the rest of input."""
        start_pos = self.pos
        self.pos += 1  # Skip opening quote
        phrase_start = self.pos
        while self.pos < self.length:
            if self.text[self.pos] == quote_char:
                phrase = self.text[phrase_start:self.pos]
                self.pos += 1  # Skip closing quote
                return Token(TokenType.PHRASE, phrase, start_pos)
            self.pos += 1
        # Unclosed quote - treat rest as phrase
        phrase = self.text[phrase_start:]
        return Token(TokenType.PHRASE, phrase, start_pos)

    def _read_word(self) -> Token:
        """Read a word (term, keyword, or wildcard)."""
        start_pos = self.pos
        word = []
        has_wildcard = False
        while self.pos < self.length:
            char = self.text[self.pos]
            # Stop on whitespace or parentheses
            if char.isspace() or char in '()':
                break
            # Stop on quotes (start of phrase)
            if char in '"\'':
                break
            # Track wildcards
            if char == '*':
                has_wildcard = True
            word.append(char)
            self.pos += 1
        word_str = ''.join(word)
        word_upper = word_str.upper()
        # Check for keywords (a word containing '*' is never a keyword)
        if not has_wildcard and word_upper in self.KEYWORDS:
            return Token(self.KEYWORDS[word_upper], word_str, start_pos)
        # Wildcard term
        if has_wildcard:
            return Token(TokenType.WILDCARD, word_str, start_pos)
        # Regular term
        return Token(TokenType.TERM, word_str, start_pos)
# =============================================================================
# Parser
# =============================================================================
class BooleanSearchParser:
    """
    Parse Boolean search expressions and convert to FTS5 syntax.

    Grammar:
        query    -> or_expr
        or_expr  -> and_expr (OR and_expr)*
        and_expr -> not_expr (AND? not_expr)*
        not_expr -> NOT? primary
        primary  -> TERM | PHRASE | WILDCARD | '(' or_expr ')'
    """

    def __init__(self):
        self.tokens: List[Token] = []        # Token stream for the current parse
        self.pos = 0                         # Index into self.tokens
        self.terms: List[str] = []           # Plain terms collected during parse
        self.phrases: List[str] = []         # Quoted phrases collected
        self.negated_terms: List[str] = []   # NOT-ed terms/phrases collected
        self.has_wildcards = False           # Any '*' term seen

    def parse(self, query: str) -> BooleanParseResult:
        """Parse a Boolean search query and return FTS5-compatible syntax.

        On any internal error the original query is returned as the
        fts_query fallback with success=False.
        """
        if not query or not query.strip():
            return BooleanParseResult(
                fts_query='',
                original=query,
                success=False,
                error='Empty query'
            )
        try:
            # Reset state so a parser instance can be reused across queries
            self.terms = []
            self.phrases = []
            self.negated_terms = []
            self.has_wildcards = False
            # Tokenize
            lexer = BooleanLexer(query)
            self.tokens = lexer.tokenize()
            self.pos = 0
            # Parse (recursive descent, lowest precedence first)
            fts_query = self._parse_or_expr()
            return BooleanParseResult(
                fts_query=fts_query.strip(),
                original=query,
                success=True,
                terms=self.terms.copy(),
                phrases=self.phrases.copy(),
                negated_terms=self.negated_terms.copy(),
                has_wildcards=self.has_wildcards
            )
        except Exception as e:
            logger.error(f"Boolean parse error: {e}")
            return BooleanParseResult(
                fts_query=query,  # Fall back to original
                original=query,
                success=False,
                error=str(e)
            )

    def _current(self) -> Token:
        """Get current token (EOF sentinel when past the end)."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return Token(TokenType.EOF, '', -1)

    def _advance(self) -> Token:
        """Advance to next token and return the one just consumed."""
        token = self._current()
        self.pos += 1
        return token

    def _match(self, *types: TokenType) -> bool:
        """Check if current token matches any of the given types."""
        return self._current().type in types

    def _parse_or_expr(self) -> str:
        """Parse OR expression: and_expr (OR and_expr)*"""
        left = self._parse_and_expr()
        while self._match(TokenType.OR):
            self._advance()  # Consume OR
            right = self._parse_and_expr()
            left = f"{left} OR {right}"
        return left

    def _parse_and_expr(self) -> str:
        """Parse AND expression: not_expr (AND? not_expr)*"""
        left = self._parse_not_expr()
        while True:
            # Explicit AND
            if self._match(TokenType.AND):
                self._advance()  # Consume AND
                right = self._parse_not_expr()
                left = f"{left} {right}"  # FTS5 implicit AND (space-separated)
            # Implicit AND (operand follows operand without operator)
            elif self._match(TokenType.TERM, TokenType.PHRASE, TokenType.WILDCARD,
                             TokenType.NOT, TokenType.LPAREN):
                right = self._parse_not_expr()
                left = f"{left} {right}"
            else:
                break
        return left

    def _parse_not_expr(self) -> str:
        """Parse NOT expression: NOT? primary (NOT maps to FTS5 '-' prefix)."""
        if self._match(TokenType.NOT):
            self._advance()  # Consume NOT
            primary = self._parse_primary()
            # Track negated term (skip parenthesized groups)
            if primary and not primary.startswith('('):
                clean = primary.strip('"')
                if clean not in self.negated_terms:
                    self.negated_terms.append(clean)
            return f"-{primary}"
        return self._parse_primary()

    def _parse_primary(self) -> str:
        """Parse primary: TERM | PHRASE | WILDCARD | '(' or_expr ')'"""
        token = self._current()
        if token.type == TokenType.TERM:
            self._advance()
            if token.value not in self.terms:
                self.terms.append(token.value)
            return token.value
        if token.type == TokenType.PHRASE:
            self._advance()
            if token.value not in self.phrases:
                self.phrases.append(token.value)
            return f'"{token.value}"'
        if token.type == TokenType.WILDCARD:
            self._advance()
            self.has_wildcards = True
            return token.value
        if token.type == TokenType.LPAREN:
            self._advance()  # Consume (
            expr = self._parse_or_expr()
            if self._match(TokenType.RPAREN):
                self._advance()  # Consume ) — a missing ')' is tolerated
            return f"({expr})"
        if token.type == TokenType.EOF:
            return ''
        # Unexpected token - skip it and keep parsing
        self._advance()
        return ''
# =============================================================================
# Convenience Functions
# =============================================================================
def parse_boolean_query(query: str) -> BooleanParseResult:
    """
    Parse a Boolean search query and return FTS5-compatible syntax.

    Args:
        query: User search query with AND, OR, NOT operators

    Returns:
        BooleanParseResult with fts_query ready for FTS5 MATCH

    Examples:
        >>> parse_boolean_query("error AND auth")
        BooleanParseResult(fts_query="error auth", ...)
        >>> parse_boolean_query("deploy OR kubernetes")
        BooleanParseResult(fts_query="deploy OR kubernetes", ...)
        >>> parse_boolean_query("security NOT test")
        BooleanParseResult(fts_query="security -test", ...)
    """
    parser = BooleanSearchParser()
    return parser.parse(query)
def to_fts5(query: str) -> str:
    """
    Convert a Boolean search query to FTS5 syntax.

    Simple wrapper that returns just the FTS5 query string.
    Returns the original query unchanged if parsing fails.

    Args:
        query: User search query

    Returns:
        FTS5-compatible query string
    """
    result = parse_boolean_query(query)
    return result.fts_query if result.success else query
def is_boolean_query(query: str) -> bool:
    """
    Check if a query contains Boolean operators.

    Args:
        query: Search query to check

    Returns:
        True if query contains AND, OR, or NOT operators
    """
    # Word-boundary alternation so substrings like "android" or "northern"
    # don't count; uppercasing the query makes the check case-insensitive.
    return bool(re.search(r'\b(?:AND|OR|NOT)\b', query.upper()))
# =============================================================================
# Help Text
# =============================================================================
# User-facing help text for /cxq Boolean search (shown via get_help()).
BOOLEAN_SEARCH_HELP = """
Boolean Search Syntax (J.4.4.1-J.4.4.3)

OPERATORS:
    AND     Require both terms (implicit between consecutive terms)
    OR      Match either term
    NOT     Exclude term

SYNTAX:
    term1 AND term2        Both terms required
    term1 OR term2         Either term matches
    NOT term               Exclude term
    term1 AND NOT term2    First term required, second excluded
    "exact phrase"         Match exact phrase
    prefix*                Prefix wildcard
    *suffix                Suffix wildcard (requires FTS5 tokenize)

GROUPING:
    (term1 OR term2) AND term3    Parentheses for precedence

EXAMPLES:
    /cxq "error AND authentication"
    /cxq "deploy OR kubernetes"
    /cxq "security NOT test"
    /cxq '"database connection" AND timeout'
    /cxq '(error OR warning) AND "stack trace"'
    /cxq "api* AND auth*"

NOTES:
- Operators are case-insensitive (and, AND, And all work)
- Space between terms implies AND
- Quotes preserve phrases with spaces
- Wildcards (*) match partial terms
"""
def get_help() -> str:
    """Return Boolean search help text."""
    return BOOLEAN_SEARCH_HELP
# =============================================================================
# Main (for testing)
# =============================================================================
# Bug fix: the entry-point guard must compare __name__ against '__main__';
# the original compared the undefined name `name` to 'main'.
if __name__ == '__main__':
    import sys

    # Test queries exercising each operator, phrases, wildcards, and grouping.
    test_queries = [
        "error AND authentication",
        "deploy OR kubernetes",
        "security NOT test",
        "error AND NOT debug",
        '"database connection" AND timeout',
        '(error OR warning) AND "stack trace"',
        "api* AND auth*",
        "simple query",
        '"exact phrase"',
        "term1 term2 term3",  # Implicit AND
        "NOT excluded",
        "(a OR b) AND (c OR d)",
    ]

    if len(sys.argv) > 1:
        # Parse command-line query
        query = ' '.join(sys.argv[1:])
        result = parse_boolean_query(query)
        print(f"Input: {result.original}")
        print(f"FTS5: {result.fts_query}")
        print(f"Terms: {result.terms}")
        print(f"Phrases: {result.phrases}")
        print(f"Negated: {result.negated_terms}")
        print(f"Wildcards: {result.has_wildcards}")
    else:
        # Run test queries
        print("Boolean Search Parser Tests\n" + "=" * 40)
        for query in test_queries:
            result = parse_boolean_query(query)
            status = "✓" if result.success else "✗"
            print(f"{status} Input: {query}")
            print(f"  FTS5: {result.fts_query}")
            if result.negated_terms:
                print(f"  Negated: {result.negated_terms}")
            print()