I’ve got a scanner method that does some cool whitespace handling. Unfortunately, it’s really, really ugly. Does anyone have any refactoring advice for this function? I think I may be trying to do too much in the scanning stage; maybe I need to move the indent tracking into a separate pruning-stage method.
Here it is:
def scan(self):
    """Scan the source lines and build the token stream.

    Each entry of ``self.code`` is treated as one source line.  Every line
    contributes a ``NewLine`` token (linked to the previous line's
    ``NewLine``) followed by that line's significant tokens.  Leading
    ``INDENT`` tokens are counted rather than emitted, and ``WHITESPACE``
    tokens are discarded.

    Indent levels are recorded on the ``NewLine`` tokens:

    * a line with at least one significant token gets its leading
      ``INDENT`` count;
    * an empty line inherits the indent level of the previous line;
    * an empty first line keeps whatever ``NewLine`` initialises.

    Returns:
        list: the ``NewLine`` and significant tokens, in source order.

    Raises:
        AssertionError: if ``self.code`` is empty, if ``match_token``
            finds no match at some position, or if the first line is
            indented.  Raised explicitly (not via ``assert``) so the
            checks survive ``python -O``.
    """
    if not self.code:
        raise AssertionError("ERROR: empty source code")

    tokens = []
    newline_token = None
    for line_number, line in enumerate(self.code):
        # Start every source line with a NewLine token that points back
        # at the previous line's NewLine (None for the first line).
        newline_token = NewLine(line_number, newline_token)
        tokens.append(newline_token)

        line_tokens, indent_tally = self._scan_line(line, newline_token)
        tokens.extend(line_tokens)

        if not newline_token.empty_flag:
            # Non-empty line: record its own leading indent count.
            newline_token.indent_level = indent_tally
        elif newline_token.prev_newline:
            # Empty line: inherit the previous line's indent level.
            # (The original assigned ``line_number`` here, which clobbered
            # the line number and never propagated the indent.)
            newline_token.indent_level = newline_token.prev_newline.indent_level
        # Otherwise this is an empty first line; leave the default level.

    if tokens and tokens[0].indent_level > 0:
        raise AssertionError("Unexpected leading indent at line 0")
    return tokens

def _scan_line(self, line, newline_token):
    """Tokenize one line; return ``(significant_tokens, leading_indent_count)``.

    Clears ``newline_token.empty_flag`` as soon as a token that is neither
    ``WHITESPACE`` nor ``INDENT`` is matched.  Presumably ``empty_flag``
    starts out true on a fresh ``NewLine`` -- confirm against ``NewLine``.

    Raises:
        AssertionError: if ``match_token`` returns a falsy result.
    """
    line_tokens = []
    indent_tally = 0
    position = 0
    while position < len(line):
        token_result = self.match_token(line, position, newline_token)
        if not token_result:
            raise AssertionError(
                f"ERROR: no match for match_token('{line[position:]}')"
            )

        token_id = token_result.token_id
        if newline_token.empty_flag and token_id == 'INDENT':
            # Still in the leading run of indents: count, don't emit.
            indent_tally += 1
        elif token_id not in ('WHITESPACE', 'INDENT'):
            # First significant token ends indent counting for this line.
            newline_token.empty_flag = False
            # NOTE(review): the original paste lost its indentation; the
            # append is placed here (significant tokens only) to match the
            # stated intent of disposing of whitespace -- confirm.
            line_tokens.append(token_result)

        # Assumes match_token never yields an empty lexeme; otherwise this
        # loop would not advance -- TODO confirm.
        position += len(token_result.lexeme)
    return line_tokens, indent_tally