diff options
| -rwxr-xr-x | camel_2_snake.py | 219 | ||||
| -rwxr-xr-x | run_format.sh | 4 |
2 files changed, 223 insertions, 0 deletions
diff --git a/camel_2_snake.py b/camel_2_snake.py new file mode 100755 index 00000000..5a652324 --- /dev/null +++ b/camel_2_snake.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 + +# This script changes variables names in C++ files from +# camelCase style to snake_case style, with corresponding +# naming conventions observed (e.g. "bIsFoo" => "is_foo") +# +# +# Limitations caused by lack of syntax awareness: +# a. despite best efforts (see besteffort_* below), in corner cases it leaves some +# variables initialized by ()-style expression untouched if the variable does +# not end with an underscore, as it thinks the variable is a function: +# what it does: variableName(initArgument) => variableName(init_argument) +# it should do: variableName(initArgument) => variable_name(init_argument) +# b. it replaces function name if it is used as a pointer, as it thinks the function +# is a variable: +# what it does: int (*funcPtr)(int, int) = &addInts; => int (*func_ptr)(int, int) = &add_ints; +# it should do: int (*funcPtr)(int, int) = &addInts; => int (*func_ptr)(int, int) = &addInts; +# c. it does not check name collision: +# what it does: bool isGreek, bIsGreek; => bool is_greek, is_greek; +# it should do: bool isGreek, bIsGreek; => bool is_greek, is_greek_2; + +import os, sys +import re +import argparse + +REGEX_PIECE_1 = r"(\A|(?<=\W))([a-jl-z]|[a-z]{2,})([A-Z][a-z]*|[0-9]+)+_?(?=[^\w\(]|$)" +REGEX_PIECE_2 = r"(\A|(?<=\W))([a-jl-z]|[a-z]{2,})([A-Z][a-z]*|[0-9]+)+_(?=\()" +REGEX_PIECE_IN_CTOR_INIT = r"(\A|(?<=\)\s:\s|\S\),\s)|(?<=\A:\s|\s\s))[a-z]+([A-Z][a-z]*|[0-9]+)+(?=\()" +DROMEDARY_CAMEL_CASE_VAR = re.compile(REGEX_PIECE_1 + "|" + REGEX_PIECE_2) +DROMEDARY_CAMEL_CASE_VAR_IN_CTOR_INIT = re.compile( + REGEX_PIECE_1 + "|" + REGEX_PIECE_2 + "|" + REGEX_PIECE_IN_CTOR_INIT) + +BOOLEAN_PREFIXES = [ + "is", "are", "was", "were", + "has", "have", "had", + "does", "do", "did", "done", + "find", "found", "get", "got" +] +COMMON_ABBREVIATIONS = { + # NOTE no "obj", "num", "it", "iter", "var", "src", "dest", + # "ret", "init", "ptr", "op", "db" + "res" : "result", "buf" : "buffer", "vec" : "vector", "msg" : "message", + "seq" : "sequence", "cnt" : "count", "mem" : "memory", "val" : "value", + "loc" : "location", "ans" : "answer", "ctx" : "context", "elem" : "element", + "ty" : "type", +} + +CAMEL_CASE_PIECE_REGEX = re.compile(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|[0-9]|_|$)|[0-9]+|\_") +def compute_snake_case(camel_case, testing=False): + splitted_words = CAMEL_CASE_PIECE_REGEX.findall(camel_case) + if testing: + print("%-15s => %s" % (camel_case, splitted_words)) + splitted_words = list(map(lambda w : w.lower(), splitted_words)) + ends_with_underscore = False + if splitted_words[-1] == '_': + splitted_words = splitted_words[:-1] + ends_with_underscore = True + assert len(splitted_words) >= 2 + # special rules in conversion: observe naming conventions for snake_case + had_hungarian_prefix = False + if ((splitted_words[0] == "p" or splitted_words[0] == "m" + or splitted_words[0] == "n" or splitted_words[0] == "f") + and splitted_words[1].isalpha()): + had_hungarian_prefix = True + splitted_words = splitted_words[1:] + if splitted_words[0] == "b" and splitted_words[1].isalpha(): + if splitted_words[1] in BOOLEAN_PREFIXES: + splitted_words = splitted_words[1:] + else: + splitted_words = [ "is" ] + splitted_words[1:] + if splitted_words[0] == "it": + splitted_words = [ "iter" ] + splitted_words[1:] + if splitted_words[-1] == "num": + splitted_words = splitted_words[:-1] + [ "number" ] + splitted_words = [ COMMON_ABBREVIATIONS.get(w, w) for w in splitted_words ] + # make snake_case + if not ((len(splitted_words) >= 1 if had_hungarian_prefix else len(splitted_words) >= 2) + and splitted_words[0].isalpha()): + raise RuntimeError("'%s' => %s" % (camel_case, splitted_words)) + snake_case = '_'.join(splitted_words) + ('_' if ends_with_underscore else '') + return snake_case + +MAX_LOOP_STEPS = 16 # unlikely to have more than this number of camelCase variables in one line +def process_one_line(old_line, in_ctor_init_list=False, testing=False): + step_count, instance_count = 0, 0 + line = old_line + while True: + step_count += 1 + if step_count > MAX_LOOP_STEPS: + raise RuntimeError("maximum loop steps (%d) exceeded, line:\n%s" % ( + MAX_LOOP_STEPS, old_line)) + regex_obj = DROMEDARY_CAMEL_CASE_VAR + if in_ctor_init_list: + regex_obj = DROMEDARY_CAMEL_CASE_VAR_IN_CTOR_INIT + matchObj = re.search(regex_obj, line) + if not matchObj: + return line, instance_count + camel_case_var = matchObj.group(0) + camel_case_var_start, camel_case_var_end = matchObj.start(), matchObj.end() + instance_count += 1 + snake_case_var = compute_snake_case(camel_case_var, testing) + line = line[:camel_case_var_start] + snake_case_var + line[camel_case_var_end:] + +def process_one_file(filepath, handling_dir, echo, rewrite): + with open(filepath, 'r') as f: + raw_lines = f.readlines() + instance_count, new_lines = 0, [] + if echo: + print() # newline + # best effort of determing whether "foo(a)" is a ()-style initialization or a function call + besteffort_in_ctor_init_list = False + for i, raw_line in enumerate(raw_lines): + old_line = raw_line.rstrip() + ### check if in ctor initializer list (1) + besteffort_index_of_colon = old_line.find(": ") # -1 if absent + if besteffort_index_of_colon >= 0: + if ((besteffort_index_of_colon >= 2 + and old_line[besteffort_index_of_colon - 2:besteffort_index_of_colon] == ") " + and " ?" not in old_line[:besteffort_index_of_colon]) + or (i >= 1 and len(old_line[:besteffort_index_of_colon].strip()) == 0 + and raw_lines[i - 1].rstrip().endswith(")") + and (" ?" not in raw_lines[i - 1]))): + besteffort_in_ctor_init_list = True + ### essential works + new_line, instance_count_in_line = process_one_line( + old_line, besteffort_in_ctor_init_list) + if echo: + ctor_init_list_mark = "\x1b[48;5;237m" if besteffort_in_ctor_init_list else "\x1b[0m" + if instance_count_in_line > 0: + print("-|\x1b[0m" + ctor_init_list_mark + + "\x1b[35m" + old_line + "\x1b[0m") + print("+|\x1b[0m" + ctor_init_list_mark + + "\x1b[32m" + new_line + "\x1b[0m") + else: + print(" |\x1b[0m" + ctor_init_list_mark + old_line + "\x1b[0m") + new_lines.append(new_line) + instance_count += instance_count_in_line + ### check if in ctor initializer list (2) + if besteffort_in_ctor_init_list: + besteffort_index_of_open_brace = old_line.find(" {") # -1 if absent + if besteffort_index_of_open_brace >= 0: + besteffort_in_ctor_init_list = False + if rewrite and instance_count: + with open(filepath, 'w') as f: + f.write('\n'.join(new_lines) + "\n\n") + if not rewrite and not echo and not handling_dir: + sys.stderr.write("--- begin : %s ---\n" % filepath) + sys.stdout.write('\n'.join(new_lines) + "\n\n") + sys.stderr.write("--- end : %s ---\n" % filepath) + return instance_count + +def is_c_cxx(filename): + if (filename.endswith(".h") or filename.endswith(".cc") + or filename.endswith(".cpp") or filename.endswith(".c")): + return True + return False + +def work(args): + if args.test != None: + new_line, _ = process_one_line(args.test, testing=True) + print("\x1b[32;m" + new_line + "\x1b[0m") + return + files_to_read, handling_dir = [], False + if os.path.isfile(args.path): + files_to_read = [ args.path ] + else: + assert os.path.isdir(args.path) + handling_dir = True + for (dirpath, dirnames, filenames) in os.walk(args.path, followlinks=True): + for filename in filter(lambda f : is_c_cxx(f), filenames): + if (((os.sep + "test-inputs") in dirpath) + or ((os.sep + "third-party") in dirpath) + or ((os.sep + "linters") in dirpath)): + continue + files_to_read.append(os.path.join(dirpath, filename)) + processed_instances_sum = 0 + for filepath in files_to_read: + sys.stderr.write("%s .." % (filepath)) + sys.stderr.flush() + processed_instances = process_one_file( + filepath, handling_dir=handling_dir, echo=args.echo, rewrite=args.rewrite) + processed_instances_sum += processed_instances + sys.stderr.write("\r%s count: %d\n" % (filepath, processed_instances)) + sys.stderr.flush() + if not args.rewrite: + if handling_dir: + sys.stderr.write("file count: %d, instance count: %d\n" % ( + len(files_to_read), processed_instances_sum)) + sys.stderr.write("\nTo rewrite files, use '--rewrite'; to echo lines, use '--echo'\n") + return 0 + +def main(): + parser = argparse.ArgumentParser(description="C/C++ vairable name camelCase => snake_case") + parser.add_argument("path", nargs='?', default=None, + help="file or directory") + parser.add_argument("--rewrite", action="store_true", + help="rewrite visited C/C++ files [[caution advised]]") + parser.add_argument("-e", "--echo", action='store_true', + help="echo each line, before and after") + parser.add_argument("-t", "--test", metavar="\"..\"", type=str, default=None, + help="(dev) test one line in \"..\"") + args = parser.parse_args() + has_error = False + if args.test == None and args.path == None: + has_error = True + sys.stderr.write("[Error] you need to give argument 'path' or use option '--test'\n") + if args.test != None and (args.path != None or args.rewrite or args.echo): + has_error = True + sys.stderr.write("[Error] '--test \"..\"' can only be used alone, but you gave something else too\n") + if args.test == None and (args.rewrite or args.echo) and args.path == None: + has_error = True + sys.stderr.write("[Error] you need to give the path argument while using '--rewrite' or '--echo'\n") + if args.path != None and not os.path.exists(args.path): + has_error = True + sys.stderr.write("[Error] not found: %s\n" % args.path) + return 1 if has_error else work(args) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/run_format.sh b/run_format.sh index 9385aed6..7e7189a0 100755 --- a/run_format.sh +++ b/run_format.sh @@ -14,3 +14,7 @@ echo -e "Files found to format = \n\"\"\"\n$FILE_LIST\n\"\"\"" # mistakenly see the entire blob of newline-separated file names as a SINGLE file name instead # of as a new-line separated list of *many* file names! clang-format --verbose -i --style=file $FILE_LIST + +foreach i ( $FILE_LIST ) + ./camel_2_snake.py i +end |
