diff --git a/README b/README
index af681d9..6242267 100644
--- a/README
+++ b/README
@@ -1,5 +1,5 @@
 ##############################################################################################
-# Copyright 2018 The Johns Hopkins University Applied Physics Laboratory LLC
+# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
 # All rights reserved.
 # Permission is hereby granted, free of charge, to any person obtaining a copy of this
 # software and associated documentation files (the "Software"), to deal in the Software
@@ -16,43 +16,50 @@
 #
 # HAVE A NICE DAY.
 
-#################################################################################
-####  Local Function Affinity - Detecting Object File Boundaries in IDA Pro  ####
-#################################################################################
+#################################################################
+####  CodeCut - Detecting Object File Boundaries in IDA Pro  ####
+#################################################################
 
 **** Terminology ****
 
 I tend to use the term "module" for a set of related functions within a
 binary that came from a single object file. So you will see the terms "module" and
-"object file" used interchangeabley in the LFA source and documentation.
+"object file" used interchangeably in the CC source and documentation.
 
 **** Dependencies ****
 
-The only dependency is Natural Language Toolkit (NLTK) - which you can get
-from https://www.nltk.org.
+CodeCut relies on:
+Natural Language Toolkit (NLTK) - https://www.nltk.org
+Snap.py - https://snap.stanford.edu/snappy/
 
 **** Source Files ****
 
-lfa.py           - Main program - written to run as a script from IDA
-                   Pro. By default, will analyze either the .text
-                   section or the full database, and output the files
-                   listed in the "Output Files" section. *** Make sure to
-                   set IDA_VERSION before you start! ***
+cc_main.py       - Main entry point - simply load this up with the
+                   "File -> Script file..." option in IDA.
 
-basicutils<_6x,_7x>.py - provides a version-agnostic API to IDA. You need to
-                   set a definition in lfa.py to select which version
-                   of IDA you have.
+lfa.py           - Analysis engine for LFA.
+
+mc.py            - Analysis engine for MaxCut.
+
+basicutils_7x.py - Provides an API to IDA - maybe one day we'll get this
+                   ported to Ghidra!
 
 map_read.py      - For research purposes - compares a ground truth .map
-                   file (from ld) to a .map file from LFA and produces
+                   file (from ld) to a .map file from CC and produces
                    a score. See RECON slides or the code itself for
                    more info. You need to add the option -Map=.map
                    to the linker options in a Makefile to get a .map
                    file. The syntax to map_read is:
 
                    python map_read.py
 
-**** LFA Parameters, Interpolation, and Output ****
+**** MaxCut Parameters ****
+
+ - Right now there is only one parameter for MaxCut, a value for the maximum
+   module size (currently set to 16K).
+
+
+**** LFA Parameters & Interpolation ****
 
 A couple areas for research:
 
@@ -81,34 +88,35 @@ A couple areas for research:
 
 **** Output Files ****
 
-LFA produces 4 files:
+CodeCut produces 7 files:
 
-_lfa_results.csv - Raw score output from LFA, including where edges
-                   are detected. Graphs can fairly easily be
-                   generated in your favorite spreadsheet program.
+_cc_results.csv  - Raw score output from LFA and MaxCut, including where
+                   edges are detected. Graphs can fairly easily be
+                   generated in your favorite spreadsheet program.
 
-_lfa_labels.py   - Script that can be used to label your DB with LFA's
-                   output.
-                   After determining module boundaries, LFA
-                   attempts to guess the name (fun!) by looking at
-                   common strings used by the module. Y can use this
-                   script as a scratchpad to name unnamed modules as you
-                   determine what they are, or you can also use other
-                   functions in basicutils to change module names later.
+_{lfa,mc}_labels.py - Script that can be used to label your DB with CC's
+                   output. After determining module boundaries, CC
+                   attempts to guess the name (fun!) by looking at
+                   common strings used by the module, for both the
+                   LFA and MaxCut module lists. You can use this
+                   script as a scratchpad to name unnamed modules as you
+                   determine what they are, or you can also use other
+                   functions in basicutils to change module names later.
 
-_lfa_map.map     - A .map file similar to the output from the ld. This is
-                   for the purposes of comparing to a ground truth .map
-                   file to test LFA when you have source code.
+_{lfa,mc}_map.map - A .map file similar to the output from ld. This is
+                   for the purposes of comparing to a ground truth .map
+                   file to test CC when you have source code.
 
-_lfa_mod_graph.gv - a Graphviz graph file of the module relationships
-                   This is a directed graph where a -> b indicates
-                   that a function in module a calls a function in
-                   module b. This may take a long time to render if
-                   you have a large binary (more than a couple
-                   hundred modules detected). For smaller binaries
-                   this can pretty clearly communicate the software
-                   architecture immediately. For larger binaries
-                   this will show you graphically the most heavily
-                   used modules in the binary.
+_{lfa,mc}_mod_graph.gv - A Graphviz graph file of the module relationships.
+                   This is a directed graph where a -> b indicates
+                   that a function in module a calls a function in
+                   module b. This may take a long time to render if
+                   you have a large binary (more than a couple
+                   hundred modules detected). For smaller binaries
+                   this can pretty clearly communicate the software
+                   architecture immediately. For larger binaries
+                   this will show you graphically the most heavily
+                   used modules in the binary.
 
 You can use sfdp to render the graph into a PNG file with a command line
 like:
 
diff --git a/basicutils_7x.py b/basicutils_7x.py
index a12bdcf..720d7f4 100644
--- a/basicutils_7x.py
+++ b/basicutils_7x.py
@@ -58,7 +58,12 @@ def PrevFunction(x):
 MAX_OPCODE_LEN = 15
 def PrevInstr(ea):
     # TODO this will return an inst_t type. Need to figure out how to populate it/make workflow happy
-    return ida_ua.decode_prev_insn(ea, ea-MAX_OPCODE_LEN)
+    out = ida_ua.insn_t()
+    ida_ua.decode_prev_insn(out, ea)
+    return out.ea
+
+def CodeRefsTo(target):
+    return idautils.CodeRefsTo(target,0)
 
 def ForEveryUniqXrefTo( target, fun ):
     a = 0
@@ -89,6 +94,18 @@ def ForEveryFuncInDb( fun ):
         fun(f)
         f=NextFunction(f)
 
+def ForEveryFuncInSeg( seg, fun ):
+    start,end = SegByName(seg)
+    if (start == BADADDR):
+        start = NextFunction(0)
+        end = BADADDR
+    f = start
+    while (f < end):
+        #print "ev: %#x" % f
+        fun(f)
+        f=NextFunction(f)
+
+
 def NFuncUp( fun, n ) :
     i=0
     f=fun
@@ -248,7 +265,7 @@ def RenameFuncWithNewMod(f,mod):
     parts = n.split("_")
     new_name = "%s_%s_%08x" % (mod,parts[1],f)
     print "Renaming %s to %s\n" % (n, new_name)
-    ida_name.set_name(f,new_name) # TODO confirm this works...
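+    #note (suggestion, not original patch logic): ida_name.set_name()
+    #returns nonzero on success, so checking its result here would
+    #resolve the old TODO about confirming the rename worked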
+ ida_name.set_name(f,new_name) #Rename a module (all functions that start with _) def RenameMod(orig, new): diff --git a/cc_base.py b/cc_base.py new file mode 100644 index 0000000..6c32a6c --- /dev/null +++ b/cc_base.py @@ -0,0 +1,143 @@ +############################################################################################## +# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# +# HAVE A NICE DAY. + +import basicutils_7x as basicutils + +## CodeCut Basics +## A couple of functions for working with function and module lists and outputting results + +#locate_module() +#Return the module information for a given function +#This assumes that the module list is in order, but not necessarily contiguous +def locate_module(module_list, f): + found=0 + c=0 + #print "Finding %08x in module list length: %d" % (f,len(module_list)) + while ( (found != 1) and (c < len(module_list))): + m = module_list[c] + #print "\t%x - %x: %s" % (m.start,m.end,m.name) + #this is the case where a function falls in the cracks between modules (because it wasn't cool enough to get a score) + if (f < m.start): + found = 1 + ret = None + elif ((f >= m.start) and (f <= m.end)): + found = 1 + ret = m + c+=1 + return m + + +#gen_mod_graph() +#Output a module-to-module call graph in GraphViz format +#For each module m_1 +# For each function in the module +# For each function that calls +# Lookup the module info for m_2 +# If it's been assigned a module, add edge m_1 -> m_2 to the graph +def gen_mod_graph(module_list, suffix): + c=0 + g=set() + while (c < len(module_list)): + m = module_list[c] + f = m.start + while (f <= m.end): + for xref in basicutils.FuncXrefsFrom(f): + target = locate_module(module_list,xref) + if (target): + g.add((m.name,target.name)) + f = basicutils.NextFunction(f) + c+=1 + + root_name = basicutils.GetInputFile() + file = open(root_name + "_" + suffix + "_mod_graph.gv", "wb") + + file.write("digraph g {\n") + + for (node1,node2) in g: + line = "%s -> %s\n" % (node1,node2) + file.write(line) + + file.write("}\n") + file.close() + +#gen_rename_script() +#Output the module list with names as a Python script +#This script can then be run on the database if in the same directory as the basicutils libraries +#Look at basicutils.RenameRangeWithAddr to see the "canonical" name format - +# you can also tweak that function to use a different naming convention +def gen_rename_script(module_list, suffix): + c=0 + + root_name = basicutils.GetInputFile() + file = open(root_name + "_" + suffix + "_labels.py", "wb") + + #if (IDA_VERSION < 7): + # file.write("import basicutils_6x as 
basicutils\n"); + #else: + file.write("import basicutils_7x as basicutils\n"); + file.write("\ndef go():\n"); + + while (c.csv - which can be opened in your favorite spreadsheet program +def print_results(function_list, module_list1, module_list2): + c=0 + root_name = basicutils.GetInputFile() + file = open(root_name + "_cc_results.csv", "wb") + + #write header + file.write("Function,Function #,LFA Score 1,LFA Score 2,LFA Total,LFA Edge,MC Edge,Function Name,Suggested Mod Name (LFA), Suggested Mod Name(MC)\n"); + + while (c= m.start) and (f <= m.end)): - found = 1 - ret = m - c+=1 - return m - -#gen_mod_graph() -#Output a module-to-module call graph in GraphViz format -#For each module m_1 -# For each function in the module -# For each function that calls -# Lookup the module info for m_2 -# If it's been assigned a module, add edge m_1 -> m_2 to the graph -def gen_mod_graph(): - global g_module_list - c=0 - g=set() - while (c < len(g_module_list)): - m = g_module_list[c] - f = m.start - while (f <= m.end): - for xref in basicutils.FuncXrefsFrom(f): - target = locate_module(xref) - if (target): - g.add((m.name,target.name)) - f = basicutils.NextFunction(f) - c+=1 - - root_name = basicutils.GetInputFile() - file = open(root_name + "_lfa_mod_graph.gv", "wb") - - file.write("digraph g {\n") - - for (node1,node2) in g: - line = "%s -> %s\n" % (node1,node2) - file.write(line) - - file.write("}\n") - file.close() - -#gen_rename_script() -#Output the module list with names as a Python script -#This script can then be run on the database if in the same directory as the basicutils libraries -#Look at basicutils.RenameRangeWithAddr to see the "canonical" name format - -# you can also tweak that function to use a different naming convention -def gen_rename_script(): - global g_module_list - c=0 - - root_name = basicutils.GetInputFile() - file = open(root_name + "_lfa_labels.py", "wb") - - if (IDA_VERSION < 7): - file.write("import basicutils_6x as basicutils\n"); - else: - file.write("import basicutils_7x as basicutils\n"); - file.write("\ndef go():\n"); - - while (c should be a nonsense word, and will show up in the list -def string_range_tokenize(start,end,sep): - # get all string references in this range concatenated into a single string - t = basicutils.CompileTextFromRange(start,end,sep) - - #Enable this if you already have a bunch of function names and want to include that in the mix - #t+= basicutils.CompileFuncNamesFromRangeAsText(start,end,sep) - - #print "string_range_tokenize: raw text:" - #print t - #remove printf/sprintf format strings - tc = re.sub("%[0-9A-Za-z]+"," ",t) - #convert dash to underscore - tc = re.sub("-","_",tc) - #replace _ and / with space - may want to turn this off sometimes - #this will break up snake case and paths - #problem is that if you have a path that is used throughout the binary it will probably dominate results - tc = re.sub("_"," ",tc) - #replace / and \\ with a space - tc = re.sub("[/\\\\]"," ",tc) - #remove anything except alphanumeric, spaces, . 
(for .c, .cpp, etc) and _ - tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc) - - #lowercase it - and store this as the original set of tokens to work with - tokens = [tk.lower() for tk in tc.split()] - - #remove English stop words - #this is the list from the MIT *bow project - eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"} - #remove "code" stop words - #e.g. common words in debugging strings - code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"} - stopw = eng_stopw.union(code_sw) - c = 0 - - tokens_f = [] - - for t in tokens: - if t not in stopw: - tokens_f.append(t) - - return tokens_f - -#bracket_strings(start,end,b_brack,e_brack): -#Return the most common string in the range that begins with b_brack and ends with e_brack -# The count of how many times this string appeared is also returned -#I find somewhat often people format debug strings like "[MOD_NAME] Function X did Y!" -#This function is called by guess_module_names() - if you see this format with different brackets -#you can edit that call -def bracket_strings(start,end,b_brack,e_brack): - sep = "tzvlw" - t = basicutils.CompileTextFromRange(start,end,sep) - tokens = [tk.lower() for tk in t.split(sep)] - - b=[] - for tk in tokens: - tk = tk.strip() - - if tk.startswith(b_brack) : - b_contents = tk[1:tk.find(e_brack)] - #Hack to get rid of [-],[+],[*] - could also try to remove non alpha - if (len(b_contents) > 3): - b.append(tk[1:tk.find(e_brack)]) - - #print "bracket_strings tokens:" - #print tokens - #print b - - u_gram="" - u_gram_score=0 - if (len(b) > 0): - f = nltk.FreqDist(b) - u_gram = f.most_common(1)[0][0] - u_gram_score = f.most_common(1)[0][1] - - return (u_gram,u_gram_score) - -#source_file_strings(start,end): -#Return the most common string that looks like a source file name in the given range -# The count of how many times this string appeared is also returned -def source_file_strings(start,end): - sep = "tzvlw" - t = basicutils.CompileTextFromRange(start,end,sep) - #normally would do lower here to normalize but we lose camel case that way - tokens = [tk for tk in t.split(sep)] - - #for each string, remove quotes and commas, then tokenize based on spaces to generate the final list - tokens2=[] - for tk in tokens: - tk = tk.strip() - #strip punctuation, need to leave in _ for filenames and / and \ for paths - tk = re.sub("[\"\'\,]"," ",tk) - for tk2 in tk.split(" "): - tokens2.append(tk2) - - b=[] - for tk in tokens2: - tk = tk.strip() - if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"): - #If there's a dir path, only use the end filename - #This could be tweaked if the directory structure is part of the software architecture - #e.g. 
if there are multiple source directories with meaningful names - if tk.rfind("/") != -1: - ntk = tk[tk.rfind("/")+1:] - elif tk.rfind("\\") != -1: - ntk = tk[tk.rfind("\\")+1:] - else: - ntk = tk - b.append(ntk) - - #print "source_file_strings tokens:" - #print tokens - #print b - - #a better way to do this (if there are multiple) - #would be to sort, uniquify, and then make the name foo.c_and_bar.c - u_gram="" - u_gram_score=0 - if (len(b) > 0): - f = nltk.FreqDist(b) - u_gram = f.most_common(1)[0][0] - u_gram_score = f.most_common(1)[0][1] - - return (u_gram,u_gram_score) - -#common_strings(start,end): -#Return a list of the common strings in the given range -#Uses NLTK to generate a list of unigrams, bigrams, and trigrams (1 word, 2 word phrase, 3 word phrase) -#If the trigram score > 1/2 * bigram score, the most common trigram is used -#If the bigram score > 1/2 * unigram score, the most common bigram is used -#Otherwise the most common unigram (single word is used) -def common_strings(start,end): - CS_THRESHOLD = 6 - sep = "tvlwz" - - tokens = string_range_tokenize(start,end,sep) - - #make a copy since we're going to edit it - u_tokens = tokens - c=0 - while (c= u_gram_score): - if (t_gram_score * 2 >= b_gram_score): - ret = t_str - ret_s = t_gram_score - else: - ret = b_str - ret_s = b_gram_score - else: - ret = u_gram - ret_s = u_gram_score - - #print "%08x - %08x : %s" % (start,end,ret) - - return (ret,ret_s) - -### End of NLP Section ### - - #func_callers_weight(f): #Return the LFA score for functions that this functions calls (i.e. the "calls from" score) #If there are no references, return 0 @@ -458,7 +102,7 @@ def func_callee_weight(f): fc = 0 fs = 0 a = 0 - for xref in idautils.CodeRefsTo(f,0): + for xref in basicutils.CodeRefsTo(f): dist = abs(xref - f) #print "%08x: %08x %d " % (f, xref, dist), @@ -519,7 +163,10 @@ def func_call_weight(f_start, f_end): prevscore_2 = 0 z1 = 1 z2 = 1 - f = idc.NextFunction(f) + finf = module.func_info(f,0,0) + finf.lfa_skip=1 + g_function_list.append(finf) + f = basicutils.NextFunction(f) continue #if 1st or 2nd score is zero, interpolate using previous score and an assumed negative linear slope @@ -543,13 +190,47 @@ def func_call_weight(f_start, f_end): print "0x%08x, %d , %f, %f, %f" % (f, c,score_1, score_2, total_score) #Add scores to the global function score list - finf = func_info(f,score_1,score_2) + finf = module.func_info(f,score_1,score_2) + finf.lfa_skip=0 g_function_list.append(finf) line = "0x%08x, %d , %f, %f, %f\n" % (f,c,score_1, score_2, total_score) f=basicutils.NextFunction(f) c+=1 - + +#get_last _three and get_lfa_start: +#Previously LFA would just skip functions if they had no caller or callee score +#it would effectively drop them. This meant that when doing edge detection we +#knew every function in the function list had a score. Now we're putting all +#functions in the function list, and we have a "skip" field if LFA should skip it +#for scoring purposes. So these functions help parse that skip field, since for +#edge detection we look at the previous three scores. 
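+#For example, if the entry just before index is flagged lfa_skip, it is
+#passed over and the three nearest *scored* entries are returned instead,
+#nearest first - so the edge detector always compares real scores.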
+def get_last_three(index): + c=0 + i = index-1 + p=[] + while ((c<3) and (i>0)): + print "get_last_3: %d,%d" % (c,i) + if (g_function_list[i].lfa_skip == 0): + p.append(g_function_list[i]) + c+=1 + i-=1 + if (c==3): + return p[0],p[1],p[2] + else: + print "Error: could not find 3 scored entries before index: %d (%d,%d)" % (index, i, c) + return 0,0,0 + +def get_lfa_start(): + c=0; + i=0; + while (c < 4): + print "get_lfa_start: %d,%d" % (c,i) + if (g_function_list[i].lfa_skip==0): + c+=1 + i+=1 + return i + #edge_detect(): # Determine boundaries between object files # Edge condition is a delta of at least 2 where the current score is positive @@ -561,92 +242,44 @@ def edge_detect(): #For published research EDGE_THRESHOLD = 2 - c=3 + c=get_lfa_start() #do edge detection while (c 0) and ((s - p_1) > EDGE_THRESHOLD)): - #if 2 of last 3 were negative - m = sorted([p_1,p_2,p_3]) - if (m[1] < 0): - g_function_list[c].edge=1 + #TODO: this is not working as previously intended + #because the last 3 can have "skipped" entries in them + if (g_function_list[c].lfa_skip == 0): + f_1,f_2,f_3 = get_last_three(c) + p_1 = f_1.total_score + p_2 = f_2.total_score + p_3 = f_3.total_score + #p_1 = g_function_list[c-1].total_score + #p_2 = g_function_list[c-2].total_score + #p_3 = g_function_list[c-3].total_score + s = g_function_list[c].total_score + #if score is positive and it is diff of at least 2 from previous + #and the previous function was not an edge + if ((not f_1.edge[0] == 1) and (s > 0) and ((s - p_1) > EDGE_THRESHOLD)): + #if 2 of last 3 were negative + m = sorted([p_1,p_2,p_3]) + if (m[1] < 0): + g_function_list[c].edge[0]=1 c+=1 #assign modules based on where the edges are c=0 mod_start = g_function_list[0].loc while(c.csv - which can be opened in your favorite spreadsheet program -def print_results(): +#Main entry point - returns an LFA module list and a global function list (with the LFA module edges marked) +def analyze(): global g_function_list - c=0 - root_name = basicutils.GetInputFile() - file = open(root_name + "_lfa_results.csv", "wb") - - #write header - file.write("Function,Function #,Score 1,Score 2,Total,Edge,Function Name,Suggested Module Name\n"); - - while (c= region_start) and (start <= region_end): + NIdV.Add(start) + if (start > region_end): + break + return snap.GetSubGraph(graph, NIdV) + +#make_cut() +#This function analyzes the region specified and returns the cut address for the address with the +#maximum score, i.e. the address that has the highest average distance call length of function calls +#that go across the address. If multiple addresses with zero calls are found (inf. score) the one +#closest to the middle of the region is returned. 
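+#Worked example (hypothetical addresses): given calls 0x100->0x400 and
+#0x200->0x300, a cut at 0x250 is crossed by both edges and scores
+#(0x300 + 0x100) / 2 = 0x200, while a cut at 0x350 is crossed only by the
+#longer edge and scores 0x300. The higher-scoring cut at 0x350 wins, on
+#the theory that only long (inter-module) calls should cross a boundary.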
+def make_cut(region_start, region_end, graph): + + print "make_cut: start: 0x%x end: 0x%x" % (region_start,region_end) + + weight = {} + z = 0 + zeroes = [] + for Node in graph.Nodes(): + start = Node.GetId() + #iterate only over nodes in this region + cut_address = start - 1 + if cut_address < region_start: + continue + + weight[cut_address] = 0 + edge_count = 0 + + for Edge in graph.Edges(): + edge_start = Edge.GetSrcNId() + edge_end = Edge.GetDstNId() + #only look at edges that cross the possible cut address + #handle both cases for the directed graph + if (edge_start < cut_address and edge_end > cut_address) or (edge_end < cut_address and edge_start > cut_address): + #print " cut %x, %x to %x cross" % (cut_address,edge_start,edge_end) + weight[cut_address] += abs(edge_end - edge_start) + edge_count +=1 + + #If we have a place where we have no edges crossing - keep track of it + #We will pick the place closest to the center of the module + if edge_count == 0: + print " returning 0 weight count at: 0x%0x" % cut_address + z+=1 + zeroes.append(cut_address) + weight[cut_address] = 0 + else: + weight[cut_address] = weight[cut_address]/ edge_count + #print "w: %x: %x" % (cut_address, weight[cut_address]) + + #if we had edges with zero crossings, pick the one closest to the center + if (z > 0): + print " total of %d zero weight counts" % (z) + center = region_start + ((region_end-region_start)/2) + min_dist = sys.maxint + for i in xrange(z): + dist = abs(center - zeroes[i]) + if dist < min_dist: + min_dist = dist + min_zero = zeroes[i] + print " returning zero cut at addr: %x" % min_zero + return min_zero + + #otherwise pick the edge with the maximum weight score + max_weight=0 + #print " weight table:" + for addr,w in weight.iteritems(): + #print " %x: %x" % (addr,w) + if w > max_weight: + max_addr = addr + max_weight = w + + print " returning max weight: %x at addr: 0x%x" % (max_weight,max_addr) + return max_addr + +#do_cutting() +#This is the main recursive algorithm for MaxCut +#Find a cut address, split the graph into two subgraphs, and recurse on those subgraphs +#Stop if the area being cut is below a particular threshold +def do_cutting(start, end, graph): + nodes = graph.GetNodes() + print "do_cutting: start: 0x%x end: 0x%x nodes: 0x%x" % (start, end, nodes) + #set this way for simple + #THRESHOLD = 0x100 + #THRESHOLD = 0x1000 + THRESHOLD = 0x4000 + + if (end - start > THRESHOLD) and (nodes > 1): + cut_address = make_cut(start, end,graph) + + graph1 = make_subgraph(start,cut_address,graph) + graph2 = make_subgraph(cut_address+1,end,graph) + + do_cutting(start,cut_address,graph1) + do_cutting(cut_address+1,end,graph2) + else: + print "Module 0x%x to 0x%x" % (start, end) + b_mod = module.bin_module(start,end,0,"") + g_maxcut_modlist.append(b_mod) + +#func_list_annotate() +#This function copies our list of modules into the function list +#This allows us to have a single function list with modules from multiple algorithms (LFA and MaxCut) +def func_list_annotate(flist): + c=0 + m=0 + while (m < len(g_maxcut_modlist)): + start = g_maxcut_modlist[m].start + while (flist[c].loc < start): + #print "F: %08x M: %08x" % (flist[c].loc, start) + c+=1 + if (c == len(flist)): + print "Error: Maxcut module list does not reconcile with function list" + return None + flist[c].edge[1]=1 + #print "MC: Set %08x func edge to 1" % flist[c].loc + m+=1 + return flist + +#Main entry point +#Returns a global function list (annotated with MaxCut edges) and a global module list +def analyze(flist): + + 
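+    #do_cutting() recurses on both halves of every cut, so in the worst
+    #case the stack depth grows with the number of modules found; Python's
+    #default recursion limit (1000) can be too low for large binaries,
+    #hence the bump below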
sys.setrecursionlimit(5000) + UGraph = snap_cg.create_snap_cg() + + g_min_node=sys.maxint + g_max_node=0 + + for Node in UGraph.Nodes(): + id = Node.GetId() + if id < g_min_node: + g_min_node = id + if id > g_max_node: + g_max_node = id + + do_cutting(g_min_node,g_max_node, UGraph) + + r_flist = func_list_annotate(flist) + + return r_flist,g_maxcut_modlist + + diff --git a/modnaming.py b/modnaming.py new file mode 100644 index 0000000..4f970ba --- /dev/null +++ b/modnaming.py @@ -0,0 +1,309 @@ +############################################################################################## +# Copyright 2018 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# +# HAVE A NICE DAY. + +IDA_VERSION = 7 + +if (IDA_VERSION < 7): + import idc + import struct + import idautils + import basicutils_6x as basicutils +else: + import ida_idaapi + import ida_idc + import ida_funcs + import ida_nalt + import ida_segment + import idautils + import basicutils_7x as basicutils + +import math +import nltk +import nltk.collocations +import re + + +### NLP Section ### + +# This section of code attempts to name the modules based on common strings in the string references +# Not really based on any sound science or anything - your mileage may heavily vary. :-D + +#string_range_tokenize(start,end,sep): +#Compile all string references between start and end as a list of strings (called "tokens") +# should be a nonsense word, and will show up in the list +def string_range_tokenize(start,end,sep): + # get all string references in this range concatenated into a single string + t = basicutils.CompileTextFromRange(start,end,sep) + + #Enable this if you already have a bunch of function names and want to include that in the mix + #t+= basicutils.CompileFuncNamesFromRangeAsText(start,end,sep) + + #print "string_range_tokenize: raw text:" + #print t + #remove printf/sprintf format strings + tc = re.sub("%[0-9A-Za-z]+"," ",t) + #convert dash to underscore + tc = re.sub("-","_",tc) + #replace _ and / with space - may want to turn this off sometimes + #this will break up snake case and paths + #problem is that if you have a path that is used throughout the binary it will probably dominate results + tc = re.sub("_"," ",tc) + #replace / and \\ with a space + tc = re.sub("[/\\\\]"," ",tc) + #remove anything except alphanumeric, spaces, . 
(for .c, .cpp, etc) and _ + tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc) + + #lowercase it - and store this as the original set of tokens to work with + tokens = [tk.lower() for tk in tc.split()] + + #remove English stop words + #this is the list from the MIT *bow project + eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"} + #remove "code" stop words + #e.g. common words in debugging strings + code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"} + stopw = eng_stopw.union(code_sw) + c = 0 + + tokens_f = [] + + for t in tokens: + if t not in stopw: + tokens_f.append(t) + + return tokens_f + +#bracket_strings(start,end,b_brack,e_brack): +#Return the most common string in the range that begins with b_brack and ends with e_brack +# The count of how many times this string appeared is also returned +#I find somewhat often people format debug strings like "[MOD_NAME] Function X did Y!" +#This function is called by guess_module_names() - if you see this format with different brackets +#you can edit that call +def bracket_strings(start,end,b_brack,e_brack): + sep = "tzvlw" + t = basicutils.CompileTextFromRange(start,end,sep) + tokens = [tk.lower() for tk in t.split(sep)] + + b=[] + for tk in tokens: + tk = tk.strip() + + if tk.startswith(b_brack) : + b_contents = tk[1:tk.find(e_brack)] + #Hack to get rid of [-],[+],[*] - could also try to remove non alpha + if (len(b_contents) > 3): + #Hack for debug prints that started with [0x%x] + if (b_contents != "0x%x"): + b.append(tk[1:tk.find(e_brack)]) + + print "bracket_strings tokens:" + print tokens + print b + + u_gram="" + u_gram_score=0 + if (len(b) > 0): + f = nltk.FreqDist(b) + u_gram = f.most_common(1)[0][0] + u_gram_score = f.most_common(1)[0][1] + + return (u_gram,u_gram_score) + +#source_file_strings(start,end): +#Return the most common string that looks like a source file name in the given range +# The count of how many times this string appeared is also returned +def source_file_strings(start,end): + sep = "tzvlw" + t = basicutils.CompileTextFromRange(start,end,sep) + #normally would do lower here to normalize but we lose camel case that way + tokens = [tk for tk in t.split(sep)] + + #for each string, remove quotes and commas, then tokenize based on spaces to generate the final list + tokens2=[] + for tk in tokens: + tk = tk.strip() + #strip punctuation, need to leave in _ for filenames and / and \ for paths + tk = re.sub("[\"\'\,]"," ",tk) + for tk2 in tk.split(" "): + tokens2.append(tk2) + + b=[] + for tk in tokens2: + tk = tk.strip() + if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"): + #If there's a dir path, only use the end filename + #This could be tweaked if the directory structure is part of the software architecture + #e.g. 
if there are multiple source directories with meaningful names + if tk.rfind("/") != -1: + ntk = tk[tk.rfind("/")+1:] + elif tk.rfind("\\") != -1: + ntk = tk[tk.rfind("\\")+1:] + else: + ntk = tk + b.append(ntk) + + print "source_file_strings tokens:" + #print tokens + print b + + #a better way to do this (if there are multiple) + #would be to sort, uniquify, and then make the name foo.c_and_bar.c + u_gram="" + u_gram_score=0 + if (len(b) > 0): + f = nltk.FreqDist(b) + u_gram = f.most_common(1)[0][0] + u_gram_score = f.most_common(1)[0][1] + + return (u_gram,u_gram_score) + +#common_strings(start,end): +#Return a list of the common strings in the given range +#Uses NLTK to generate a list of unigrams, bigrams, and trigrams (1 word, 2 word phrase, 3 word phrase) +#If the trigram score > 1/2 * bigram score, the most common trigram is used +#If the bigram score > 1/2 * unigram score, the most common bigram is used +#Otherwise the most common unigram (single word is used) +def common_strings(start,end): + CS_THRESHOLD = 6 + sep = "tvlwz" + + tokens = string_range_tokenize(start,end,sep) + + #make a copy since we're going to edit it + u_tokens = tokens + c=0 + while (c= u_gram_score): + if (t_gram_score * 2 >= b_gram_score): + ret = t_str + ret_s = t_gram_score + else: + ret = b_str + ret_s = b_gram_score + else: + ret = u_gram + ret_s = u_gram_score + + #print "%08x - %08x : %s" % (start,end,ret) + + return (ret,ret_s) + +### End of NLP Section ### + + + +#guess_module_names(): +#Use the NLP section (above) to guess the names of modules and add them to the global module list +#Attempts to find common bracket strings (e.g. "[MOD_NAME] Debug print!") +#then source file names (most often left over from calls to assert()) +#then common trigram/bigram/unigrams +#You can tweak the switchover thresholds below. +def guess_module_names(module_list): + #idea - make score threshold based on the size of the module + # (e.g. smaller modules should have a smaller threshold + C_SCORE_THRESHOLD = 3 + S_SCORE_THRESHOLD = 1 + B_SCORE_THRESHOLD = 1 + c=0 + unk_mod=0 + while (c %08x" % (f_start, t) + UGraph.AddEdge(t,f_start) + + #print "s_%#x -> s_%#x" % (f_start,t)," [len = ",get_weight(func_mid, t), "]" + + +def add_node(f): + basicutils.ForEveryXrefToD(f, add_edge) + +def create_snap_cg(): + global UGraph + UGraph= snap.PNGraph.New() + + #Add every function linearly, this makes sure the nodes are in order + basicutils.ForEveryFuncInSeg(".text",UGraph.AddNode) + basicutils.ForEveryFuncInSeg(".text",add_node) + + for NI in UGraph.Nodes(): + print "node id 0x%x with out-degree %d and in-degree %d" %( + NI.GetId(), NI.GetOutDeg(), NI.GetInDeg()) + + return UGraph \ No newline at end of file
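A note on wiring (cc_main.py is added by this changeset but its contents are
not shown in the diff above): lfa.analyze() returns the LFA module list and a
function list with LFA edges marked, mc.analyze(flist) re-annotates that
function list with MaxCut edges and returns the MaxCut module list, and the
cc_base helpers write the output files. A minimal driver along these lines is
sketched below - the return order of lfa.analyze() and the exact contents of
cc_main.py are assumptions, not the shipped file.

    #Hypothetical cc_main.py flow - an illustrative sketch only
    import lfa
    import mc
    import cc_base

    def go():
        #LFA pass: module list plus function list with LFA edges marked
        #(return order assumed from the comment on lfa.analyze)
        lfa_mods, flist = lfa.analyze()
        #MaxCut pass: annotates the same function list, returns its modules
        flist, mc_mods = mc.analyze(flist)
        #Emit the combined results .csv, then the per-algorithm files
        #(module naming via modnaming.guess_module_names omitted here)
        cc_base.print_results(flist, lfa_mods, mc_mods)
        for suffix, mods in (("lfa", lfa_mods), ("mc", mc_mods)):
            cc_base.gen_mod_graph(mods, suffix)
            cc_base.gen_rename_script(mods, suffix)

    go()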