Merge branch 'tmr232-master'

evm committed 2019-10-30 15:00:48 -04:00
10 changed files with 963 additions and 501 deletions

README (92 changed lines)

@@ -1,5 +1,5 @@
##############################################################################################
# Copyright 2018 The Johns Hopkins University Applied Physics Laboratory LLC
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
@@ -16,43 +16,50 @@
#
# HAVE A NICE DAY.
#################################################################################
#### Local Function Affinity - Detecting Object File Boundaries in IDA Pro ####
#################################################################################
#################################################################
#### CodeCut - Detecting Object File Boundaries in IDA Pro ####
#################################################################
**** Terminology ****
I tend to use the term "module" for a set of related functions within a binary
that came from a single object file. So you will see the terms "module" and
"object file" used interchangeably in the LFA source and documentation.
"object file" used interchangeably in the CC source and documentation.
**** Dependencies ****
The only dependency is Natural Language Toolkit (NLTK) - which you can get
from https://www.nltk.org.
CodeCut relies on:
Natural Language Toolkit (NLTK) - https://www.nltk.org
Snap.py - https://snap.stanford.edu/snappy/
**** Source Files ****
lfa.py - Main program - written to run as a script from IDA
Pro. By default, will analyze either the .text
section or the full database, and output the files
listed in the "Output Files" section. *** Make sure to
set IDA_VERSION before you start! ***
cc_main.py - Main entry point - simply load this up with the
"File -> Script file..." option in IDA.
basicutils<_6x,_7x>.py - provides a version-agnostic API to IDA. You need to
set a definition in lfa.py to select which version
of IDA you have.
lfa.py - Analysis engine for LFA.
mc.py - Analysis engine for MaxCut.
basicutils_7x.py - Provides an API to IDA - maybe one day we'll get this
ported to Ghidra!
map_read.py - For research purposes - compares a ground truth .map
file (from ld) to a .map file from LFA and produces
file (from ld) to a .map file from CC and produces
a score. See RECON slides or the code itself for more
info. You need to add the option -Map=<target>.map to
the linker options in a Makefile to get a .map file.
The syntax for map_read is:
python map_read.py <ground truth file> <LFA map file>
python map_read.py <ground truth file> <CC map file>
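(For example, with illustrative file names: python map_read.py target.map target_lfa_map.map)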
**** LFA Parameters, Interpolation, and Output ****
**** MaxCut Parameters ****
- Right now there is only one parameter for MaxCut, a value for the maximum
module size (currently set to 16K).
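(This corresponds to THRESHOLD = 0x4000 in do_cutting() in maxcut.py; regions at or
below this size are emitted as modules rather than cut further.)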
**** LFA Parameters & Interpolation ****
A couple areas for research:
@@ -81,34 +88,35 @@ A couple areas for research:
**** Output Files ****
LFA produces 4 files:
CodeCut produces 7 files:
<target>_lfa_results.csv - Raw score output from LFA, including where edges
are detected. Graphs can fairly easily be
generated in your favorite spreadsheet program.
<target>_cc_results.csv - Raw score output from LFA and MaxCut, including where
edges are detected. Graphs can fairly easily be
generated in your favorite spreadsheet program.
<target>_lfa_labels.py - Script that can be used to label your DB with LFA's
output. After determining module boundaries, LFA
attempts to guess the name (fun!) by looking at
common strings used by the module. You can use this
script as a scratchpad to name unnamed modules as you
determine what they are, or you can also use other
functions in basicutils to change module names later.
<target>_{lfa,mc}_labels.py - Script that can be used to label your DB with CC's
output. After determining module boundaries, CC
attempts to guess the name (fun!) by looking at
common strings used by the module, for both the
LFA and MaxCut module lists. You can use this
script as a scratchpad to name unnamed modules as you
determine what they are, or you can also use other
functions in basicutils to change module names later.
<target>_lfa_map.map - A .map file similar to the output from ld. This is
for the purposes of comparing to a ground truth .map
file to test LFA when you have source code.
<target>_{lfa,mc}_map.map - A .map file similar to the output from ld. This is
for the purposes of comparing to a ground truth .map
file to test CC when you have source code.
<target>_lfa_mod_graph.gv - A Graphviz graph file of the module relationships.
This is a directed graph where a -> b indicates
that a function in module a calls a function in
module b. This may take a long time to render if
you have a large binary (more than a couple
hundred modules detected). For smaller binaries
this can pretty clearly communicate the software
architecture immediately. For larger binaries
this will show you graphically the most heavily
used modules in the binary.
<target>_{lfa,mc}_mod_graph.gv - A Graphviz graph file of the module relationships.
This is a directed graph where a -> b indicates
that a function in module a calls a function in
module b. This may take a long time to render if
you have a large binary (more than a couple
hundred modules detected). For smaller binaries
this can pretty clearly communicate the software
architecture immediately. For larger binaries
this will show you graphically the most heavily
used modules in the binary.
You can use sfdp to render the graph into a PNG file with a command line like:
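For example (output file name illustrative):
sfdp -Tpng <target>_lfa_mod_graph.gv -o mod_graph.png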

basicutils_6x.py

@@ -22,6 +22,7 @@
import idc
import struct
import idautils
import idaapi
import re
BADADDR = idc.BADADDR
@@ -202,7 +203,7 @@ def GetCanonicalName(f):
def NameCanonical(f,mod_name,func_name):
n = "%s_%s_%08x" % (mod_name,func_name,f)
print "Renaming %s to %s\n" % (idc.GetFunctionName(f),n)
idc.MakeName(f,n)
idaapi.do_name_anyway(f,n)
#Put function in canonical format when it doesn't have a name, but you know the module name
def RenameFuncWithAddr(f,s):

basicutils_7x.py

@@ -18,6 +18,7 @@
# basicutils - a version-agnostic API for IDA Pro with some (slightly) higher level functionality
# This is the 7.x version - see basicutils_6x for the 6.x version
import os
import ida_bytes
import ida_funcs
@@ -49,6 +50,12 @@ def GetFunctionName(x):
def GetInputFile():
return idc.get_root_filename()
def GetIdbFile():
return idc.get_idb_path()
def GetRootName():
return os.path.join(os.path.dirname(GetIdbFile()), os.path.basename(GetInputFile()))
def NextFunction(x):
return idc.get_next_func(x)
@@ -58,7 +65,12 @@ def PrevFunction(x):
MAX_OPCODE_LEN = 15
def PrevInstr(ea):
# TODO this will return an inst_t type. Need to figure out how to populate it/make workflow happy
return ida_ua.decode_prev_insn(ea, ea-MAX_OPCODE_LEN)
out=ida_ua.insn_t()
ida_ua.decode_prev_insn(out, ea)
return out.ea
def CodeRefsTo(target):
return idautils.CodeRefsTo(target,0)
def ForEveryUniqXrefTo( target, fun ):
a = 0
@@ -89,6 +101,19 @@ def ForEveryFuncInDb( fun ):
fun(f)
f=NextFunction(f)
def ForEveryFuncInSeg( seg, fun ):
start,end = SegByName(seg)
if (start == BADADDR):
start = NextFunction(0)
end = BADADDR
f = start
while (f < end):
#print "ev: %#x" % f
fun(f)
f=NextFunction(f)
def NFuncUp( fun, n ) :
i=0
f=fun
@@ -216,7 +241,7 @@ def GetCanonicalName(f):
def NameCanonical(f,mod_name,func_name):
n = "%s_%s_%08x" % (mod_name,func_name,f)
print "Renaming %s to %s\n" % (idc.get_func_name(f),n)
ida_name.set_name(f,n)
ida_name.force_name(f,n)
#Put function in canonical format when it doesn't have a name, but you know the module name
def RenameFuncWithAddr(f,s):
@@ -248,7 +273,7 @@ def RenameFuncWithNewMod(f,mod):
parts = n.split("_")
new_name = "%s_%s_%08x" % (mod,parts[1],f)
print "Renaming %s to %s\n" % (n, new_name)
ida_name.set_name(f,new_name) # TODO confirm this works...
ida_name.set_name(f,new_name)
#Rename a module (all functions that start with <mod>_)
def RenameMod(orig, new):

cc_base.py (new file, 152 lines)

@@ -0,0 +1,152 @@
##############################################################################################
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
import basicutils_7x as basicutils
import json
import os
## Utilities
#escape_for_graphviz()
#Return the string escaped for usage in a GraphViz file
def escape_for_graphviz(string):
return json.dumps(string)
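#(json.dumps yields a double-quoted, backslash-escaped string, which also satisfies
# GraphViz's quoted-ID syntax)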
## CodeCut Basics
## A couple of functions for working with function and module lists and outputting results
#locate_module()
#Return the module information for a given function
#This assumes that the module list is in order, but not necessarily contiguous
def locate_module(module_list, f):
found=0
c=0
ret=None
#print "Finding %08x in module list length: %d" % (f,len(module_list))
while ( (found != 1) and (c < len(module_list))):
m = module_list[c]
#print "\t%x - %x: %s" % (m.start,m.end,m.name)
#this is the case where a function falls in the cracks between modules (because it wasn't cool enough to get a score)
if (f < m.start):
found = 1
ret = None
elif ((f >= m.start) and (f <= m.end)):
found = 1
ret = m
c+=1
return ret
#gen_mod_graph()
#Output a module-to-module call graph in GraphViz format
#For each module m_1
# For each function <f> in the module
# For each function that <f> calls
# Lookup the module info for <f> m_2
# If it's been assigned a module, add edge m_1 -> m_2 to the graph
def gen_mod_graph(module_list, suffix):
c=0
g=set()
while (c < len(module_list)):
m = module_list[c]
f = m.start
while (f <= m.end):
for xref in basicutils.FuncXrefsFrom(f):
target = locate_module(module_list,xref)
if (target):
g.add((m.name,target.name))
f = basicutils.NextFunction(f)
c+=1
root_name = basicutils.GetRootName()
file = open(root_name + "_" + suffix + "_mod_graph.gv", "wb")
file.write("digraph g {\n")
for (node1,node2) in g:
line = "%s -> %s\n" % (escape_for_graphviz(node1),escape_for_graphviz(node2))
file.write(line)
file.write("}\n")
file.close()
#gen_rename_script()
#Output the module list with names as a Python script
#This script can then be run on the database if in the same directory as the basicutils libraries
#Look at basicutils.RenameRangeWithAddr to see the "canonical" name format -
# you can also tweak that function to use a different naming convention
def gen_rename_script(module_list, suffix):
c=0
root_name = basicutils.GetRootName()
file = open(root_name + "_" + suffix + "_labels.py", "wb")
#if (IDA_VERSION < 7):
# file.write("import basicutils_6x as basicutils\n");
#else:
file.write("import basicutils_7x as basicutils\n");
file.write("\ndef go():\n");
while (c<len(module_list)):
m=module_list[c]
file.write("\tbasicutils.RenameRangeWithAddr(0x%x,0x%x,%r)\n"%(m.start,m.end,m.name))
c+=1
file.write("\n")
file.write("if __name__ == \"__main__\":\n")
file.write("\treload(basicutils)\n")
file.write("\tgo()\n")
file.close()
#gen_map_file()
#Produce a .map file similar to that produced by the ld option -Map=foo.map
#Use map_read.py to test accuracy when a ground truth map file is available
def gen_map_file(module_list, suffix):
c=0
root_name = basicutils.GetRootName()
file = open(root_name + "_" + suffix + "_map.map", "wb")
while (c<len(module_list)):
m=module_list[c]
mlen = basicutils.NextFunction(m.end) - m.start
mlen_str = "0x%x" % mlen
file.write("%s0x%016x%s %s\n" % (" .text".ljust(16),m.start,mlen_str.rjust(11),m.name))
c+=1
file.close()
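#For reference, a line produced by the format string above should resemble ld's map
#output, e.g. (address, length, and module name illustrative):
# .text          0x0000000000401000     0x1f2c foo.c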
#print_results():
#Write all of the results to <target>.csv - which can be opened in your favorite spreadsheet program
def print_results(function_list, module_list1, module_list2):
c=0
root_name = basicutils.GetRootName()
file = open(root_name + "_cc_results.csv", "wb")
#write header
file.write("Function,Function #,LFA Score 1,LFA Score 2,LFA Total,LFA Edge,MC Edge,Function Name,Suggested Mod Name (LFA),Suggested Mod Name (MC)\n");
while (c<len(function_list)):
f = function_list[c]
fname = basicutils.GetFunctionName(f.loc)
m1 = locate_module(module_list1, f.loc)
m2 = locate_module(module_list2, f.loc)
#locate_module() can return None for a function that falls between modules
mname1 = m1.name if m1 else ""
mname2 = m2.name if m2 else ""
line = "0x%08x, %d , %f, %f, %f, %d, %d, %s, %s, %s\n" % (f.loc,c+1,f.score1, f.score2, f.total_score,f.edge[0],f.edge[1],fname, mname1, mname2)
file.write(line)
c+=1

cc_main.py (new file, 62 lines)

@@ -0,0 +1,62 @@
##############################################################################################
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
import maxcut
import lfa
import module
import modnaming
import cc_base
import basicutils_7x as basicutils
import snap_cg
def go():
#Do LFA and MaxCut Analysis to find module boundaries
lfa_funclist, lfa_modlist = lfa.analyze()
merge_flist,maxcut_modlist = maxcut.analyze(lfa_funclist)
#Guess names for the modules using NLP
lfa_modlist = modnaming.guess_module_names(lfa_modlist)
maxcut_modlist = modnaming.guess_module_names(maxcut_modlist)
#Output all results as .csv
cc_base.print_results(merge_flist, lfa_modlist, maxcut_modlist)
#Output module-to-module call graph as a Graphviz .gv file
cc_base.gen_mod_graph(lfa_modlist, "lfa")
cc_base.gen_mod_graph(maxcut_modlist, "mc")
#Output a Python script that will rename modules
cc_base.gen_rename_script(lfa_modlist, "lfa")
cc_base.gen_rename_script(maxcut_modlist, "mc")
#Output .map file (for comparison against ground truth, when available)
cc_base.gen_map_file(lfa_modlist, "lfa")
cc_base.gen_map_file(maxcut_modlist, "mc")
return True
if __name__ == "__main__":
reload(modnaming)
reload(module)
reload(cc_base)
reload(lfa)
reload(maxcut)
reload(snap_cg)
reload(basicutils)
go()

lfa.py (529 changed lines)

@@ -45,24 +45,14 @@
# detect 2 or 3 related object files as one file.
IDA_VERSION = 7
import basicutils_7x as basicutils
if (IDA_VERSION < 7):
import idc
import struct
import idautils
import basicutils_6x as basicutils
else:
import ida_idaapi
import ida_idc
import ida_funcs
import ida_nalt
import ida_segment
import idautils
import basicutils_7x as basicutils
#External dependencies
import math
import nltk
import nltk.collocations
#CodeCut dependencies
import cc_base
import module
#Threshold above which a function call is considered "external"
#For published research - 0x1000 = 4K
@@ -76,352 +66,6 @@ g_function_list = []
g_module_list = []
class func_info():
def __init__(self,loc,score1,score2):
self.loc = loc #the effective address of the function
self.score1=score1 #"Calls from" local function affinity score
self.score2=score2 #"Calls to" local function affinity score
self.total_score=score1+score2
self.edge=0 #Set by edge_detect() - if 1, this is the start of a new module
class bin_module():
def __init__(self,start,end,score,name):
self.start=start
self.end=end
self.score=score #Currently unused
self.name=name
#locate_module()
#Return the module information for a given function
#This assumes that the module list is in order, but not necessarily contiguous
def locate_module(f):
global g_module_list
found=0
c=0
#print "Finding %08x in module list length: %d" % (f,len(g_module_list))
while ( (found != 1) and (c < len(g_module_list))):
m = g_module_list[c]
#print "\t%x - %x: %s" % (m.start,m.end,m.name)
#this is the case where a function falls in the cracks between modules (because it wasn't cool enough to get a score)
if (f < m.start):
found = 1
ret = None
elif ((f >= m.start) and (f <= m.end)):
found = 1
ret = m
c+=1
return m
#gen_mod_graph()
#Output a module-to-module call graph in GraphViz format
#For each module m_1
# For each function <f> in the module
# For each function that <f> calls
# Lookup the module info for <f> m_2
# If it's been assigned a module, add edge m_1 -> m_2 to the graph
def gen_mod_graph():
global g_module_list
c=0
g=set()
while (c < len(g_module_list)):
m = g_module_list[c]
f = m.start
while (f <= m.end):
for xref in basicutils.FuncXrefsFrom(f):
target = locate_module(xref)
if (target):
g.add((m.name,target.name))
f = basicutils.NextFunction(f)
c+=1
root_name = basicutils.GetInputFile()
file = open(root_name + "_lfa_mod_graph.gv", "wb")
file.write("digraph g {\n")
for (node1,node2) in g:
line = "%s -> %s\n" % (node1,node2)
file.write(line)
file.write("}\n")
file.close()
#gen_rename_script()
#Output the module list with names as a Python script
#This script can then be run on the database if in the same directory as the basicutils libraries
#Look at basicutils.RenameRangeWithAddr to see the "canonical" name format -
# you can also tweak that function to use a different naming convention
def gen_rename_script():
global g_module_list
c=0
root_name = basicutils.GetInputFile()
file = open(root_name + "_lfa_labels.py", "wb")
if (IDA_VERSION < 7):
file.write("import basicutils_6x as basicutils\n");
else:
file.write("import basicutils_7x as basicutils\n");
file.write("\ndef go():\n");
while (c<len(g_module_list)):
m=g_module_list[c]
file.write("\tbasicutils.RenameRangeWithAddr(0x%x,0x%x,\"%s\")\n"%(m.start,m.end,m.name))
c+=1
file.write("\n")
file.write("if __name__ == \"__main__\":\n")
file.write("\treload(basicutils)\n")
file.write("\tgo()\n")
file.close()
#gen_map_file()
#Produce a .map file similar to that produced by the ld option -Map=foo.map
#Use map_read.py to test LFA's accuracy when a ground truth map file is available
def gen_map_file():
global g_module_list
c=0
root_name = basicutils.GetInputFile()
file = open(root_name + "_lfa_map.map", "wb")
while (c<len(g_module_list)):
m=g_module_list[c]
mlen = idc.NextFunction(m.end) - m.start
mlen_str = "0x%x" % mlen
file.write("%s0x%016x%s %s\n" % (" .text".ljust(16),m.start,mlen_str.rjust(11),m.name))
c+=1
file.close()
### NLP Section ###
# This section of code attempts to name the modules based on common strings in the string references
# Not really based on any sound science or anything - your mileage may heavily vary. :-D
#string_range_tokenize(start,end,sep):
#Compile all string references between start and end as a list of strings (called "tokens")
# <sep> should be a nonsense word, and will show up in the list
def string_range_tokenize(start,end,sep):
# get all string references in this range concatenated into a single string
t = basicutils.CompileTextFromRange(start,end,sep)
#Enable this if you already have a bunch of function names and want to include that in the mix
#t+= basicutils.CompileFuncNamesFromRangeAsText(start,end,sep)
#print "string_range_tokenize: raw text:"
#print t
#remove printf/sprintf format strings
tc = re.sub("%[0-9A-Za-z]+"," ",t)
#convert dash to underscore
tc = re.sub("-","_",tc)
#replace _ and / with space - may want to turn this off sometimes
#this will break up snake case and paths
#problem is that if you have a path that is used throughout the binary it will probably dominate results
tc = re.sub("_"," ",tc)
#replace / and \\ with a space
tc = re.sub("[/\\\\]"," ",tc)
#remove anything except alphanumeric, spaces, . (for .c, .cpp, etc) and _
tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc)
#lowercase it - and store this as the original set of tokens to work with
tokens = [tk.lower() for tk in tc.split()]
#remove English stop words
#this is the list from the MIT *bow project
eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"}
#remove "code" stop words
#e.g. common words in debugging strings
code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"}
stopw = eng_stopw.union(code_sw)
c = 0
tokens_f = []
for t in tokens:
if t not in stopw:
tokens_f.append(t)
return tokens_f
#bracket_strings(start,end,b_brack,e_brack):
#Return the most common string in the range <star,end> that begins with b_brack and ends with e_brack
# The count of how many times this string appeared is also returned
#I find somewhat often people format debug strings like "[MOD_NAME] Function X did Y!"
#This function is called by guess_module_names() - if you see this format with different brackets
#you can edit that call
def bracket_strings(start,end,b_brack,e_brack):
sep = "tzvlw"
t = basicutils.CompileTextFromRange(start,end,sep)
tokens = [tk.lower() for tk in t.split(sep)]
b=[]
for tk in tokens:
tk = tk.strip()
if tk.startswith(b_brack) :
b_contents = tk[1:tk.find(e_brack)]
#Hack to get rid of [-],[+],[*] - could also try to remove non alpha
if (len(b_contents) > 3):
b.append(tk[1:tk.find(e_brack)])
#print "bracket_strings tokens:"
#print tokens
#print b
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#source_file_strings(start,end):
#Return the most common string that looks like a source file name in the given range
# The count of how many times this string appeared is also returned
def source_file_strings(start,end):
sep = "tzvlw"
t = basicutils.CompileTextFromRange(start,end,sep)
#normally would do lower here to normalize but we lose camel case that way
tokens = [tk for tk in t.split(sep)]
#for each string, remove quotes and commas, then tokenize based on spaces to generate the final list
tokens2=[]
for tk in tokens:
tk = tk.strip()
#strip punctuation, need to leave in _ for filenames and / and \ for paths
tk = re.sub("[\"\'\,]"," ",tk)
for tk2 in tk.split(" "):
tokens2.append(tk2)
b=[]
for tk in tokens2:
tk = tk.strip()
if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"):
#If there's a dir path, only use the end filename
#This could be tweaked if the directory structure is part of the software architecture
#e.g. if there are multiple source directories with meaningful names
if tk.rfind("/") != -1:
ntk = tk[tk.rfind("/")+1:]
elif tk.rfind("\\") != -1:
ntk = tk[tk.rfind("\\")+1:]
else:
ntk = tk
b.append(ntk)
#print "source_file_strings tokens:"
#print tokens
#print b
#a better way to do this (if there are multiple)
#would be to sort, uniquify, and then make the name foo.c_and_bar.c
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#common_strings(start,end):
#Return a list of the common strings in the given range
#Uses NLTK to generate a list of unigrams, bigrams, and trigrams (1 word, 2 word phrase, 3 word phrase)
#If the trigram score > 1/2 * bigram score, the most common trigram is used
#If the bigram score > 1/2 * unigram score, the most common bigram is used
#Otherwise the most common unigram (single word is used)
def common_strings(start,end):
CS_THRESHOLD = 6
sep = "tvlwz"
tokens = string_range_tokenize(start,end,sep)
#make a copy since we're going to edit it
u_tokens = tokens
c=0
while (c<len(u_tokens)):
if u_tokens[c] == sep:
del u_tokens[c]
else:
c+=1
print "common_strings tokens:"
print tokens
if len(u_tokens) < CS_THRESHOLD:
#print "%08x - %08x : %s" % (start,end,"no string")
return ("",0)
f = nltk.FreqDist(u_tokens)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
#print "Tokens:"
#print tokens
#print len(tokens)
bgs = list(nltk.bigrams(tokens))
c=0
while (c<len(bgs)):
if sep in bgs[c]:
del bgs[c]
else:
c+=1
#print "Bigrams:"
#print bgs
if (len(bgs) != 0):
fs = nltk.FreqDist(bgs)
b_gram = fs.most_common(1)[0][0]
#print "Most Common:"
#print b_gram
b_str = b_gram[0] + "_" + b_gram[1]
b_gram_score = fs.most_common(1)[0][1]
else:
b_str =""
b_gram_score = 0
tgs = list(nltk.trigrams(tokens))
c=0
while (c<len(tgs)):
if sep in tgs[c]:
del tgs[c]
else:
c+=1
#print "Trigrams:"
#print tgs
if (len(tgs) != 0):
ft = nltk.FreqDist(tgs)
t_gram = ft.most_common(1)[0][0]
t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
t_gram_score = ft.most_common(1)[0][1]
else:
t_str = ""
t_gram_score = 0
#print "1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score)
if (b_gram_score * 2 >= u_gram_score):
if (t_gram_score * 2 >= b_gram_score):
ret = t_str
ret_s = t_gram_score
else:
ret = b_str
ret_s = b_gram_score
else:
ret = u_gram
ret_s = u_gram_score
#print "%08x - %08x : %s" % (start,end,ret)
return (ret,ret_s)
### End of NLP Section ###
#func_callers_weight(f):
#Return the LFA score for functions that this functions calls (i.e. the "calls from" score)
#If there are no references, return 0
@@ -458,7 +102,7 @@ def func_callee_weight(f):
fc = 0
fs = 0
a = 0
for xref in idautils.CodeRefsTo(f,0):
for xref in basicutils.CodeRefsTo(f):
dist = abs(xref - f)
#print "%08x: %08x %d " % (f, xref, dist),
@@ -519,7 +163,10 @@ def func_call_weight(f_start, f_end):
prevscore_2 = 0
z1 = 1
z2 = 1
f = idc.NextFunction(f)
finf = module.func_info(f,0,0)
finf.lfa_skip=1
g_function_list.append(finf)
f = basicutils.NextFunction(f)
continue
#if 1st or 2nd score is zero, interpolate using previous score and an assumed negative linear slope
@@ -543,13 +190,47 @@ def func_call_weight(f_start, f_end):
print "0x%08x, %d , %f, %f, %f" % (f, c,score_1, score_2, total_score)
#Add scores to the global function score list
finf = func_info(f,score_1,score_2)
finf = module.func_info(f,score_1,score_2)
finf.lfa_skip=0
g_function_list.append(finf)
line = "0x%08x, %d , %f, %f, %f\n" % (f,c,score_1, score_2, total_score)
f=basicutils.NextFunction(f)
c+=1
#get_last_three and get_lfa_start:
#Previously LFA would just skip functions if they had no caller or callee score
#it would effectively drop them. This meant that when doing edge detection we
#knew every function in the function list had a score. Now we're putting all
#functions in the function list, and we have a "skip" field if LFA should skip it
#for scoring purposes. So these functions help parse that skip field, since for
#edge detection we look at the previous three scores.
def get_last_three(index):
c=0
i = index-1
p=[]
while ((c<3) and (i>=0)):
print "get_last_3: %d,%d" % (c,i)
if (g_function_list[i].lfa_skip == 0):
p.append(g_function_list[i])
c+=1
i-=1
if (c==3):
return p[0],p[1],p[2]
else:
print "Error: could not find 3 scored entries before index: %d (%d,%d)" % (index, i, c)
return 0,0,0
def get_lfa_start():
c=0
i=0
while ((c < 4) and (i < len(g_function_list))):
print "get_lfa_start: %d,%d" % (c,i)
if (g_function_list[i].lfa_skip==0):
c+=1
i+=1
return i
#edge_detect():
# Determine boundaries between object files
# Edge condition is a delta of at least 2 where the current score is positive
@@ -561,92 +242,44 @@ def edge_detect():
#For published research
EDGE_THRESHOLD = 2
c=3
c=get_lfa_start()
#do edge detection
while (c<len(g_function_list)):
p_1 = g_function_list[c-1].total_score
p_2 = g_function_list[c-2].total_score
p_3 = g_function_list[c-3].total_score
s = g_function_list[c].total_score
#if score is positive and it is diff of at least 2 from previous
#and the previous function was not an edge
if ((not g_function_list[c-1].edge == 1) and (s > 0) and ((s - p_1) > EDGE_THRESHOLD)):
#if 2 of last 3 were negative
m = sorted([p_1,p_2,p_3])
if (m[1] < 0):
g_function_list[c].edge=1
#TODO: this is not working as previously intended
#because the last 3 can have "skipped" entries in them
if (g_function_list[c].lfa_skip == 0):
f_1,f_2,f_3 = get_last_three(c)
p_1 = f_1.total_score
p_2 = f_2.total_score
p_3 = f_3.total_score
#p_1 = g_function_list[c-1].total_score
#p_2 = g_function_list[c-2].total_score
#p_3 = g_function_list[c-3].total_score
s = g_function_list[c].total_score
#if score is positive and it is diff of at least 2 from previous
#and the previous function was not an edge
if ((not f_1.edge[0] == 1) and (s > 0) and ((s - p_1) > EDGE_THRESHOLD)):
#if 2 of last 3 were negative
m = sorted([p_1,p_2,p_3])
if (m[1] < 0):
g_function_list[c].edge[0]=1
c+=1
#assign modules based on where the edges are
c=0
mod_start = g_function_list[0].loc
while(c<len(g_function_list)):
f = g_function_list[c]
if (f.edge == 1):
if (f.edge[0] == 1):
p = g_function_list[c-1]
b_mod = bin_module(mod_start,p.loc,0,"")
mod_start = f.loc
b_mod = module.bin_module(mod_start,p.loc,0,"")
mod_start = f.loc #set the start of the next module to this function (where edge was detected)
g_module_list.append(b_mod)
c+=1
#guess_module_names():
#Use the NLP section (above) to guess the names of modules and add them to the global module list
#Attempts to find common bracket strings (e.g. "[MOD_NAME] Debug print!")
#then source file names (most often left over from calls to assert())
#then common trigram/bigram/unigrams
#You can tweak the switchover thresholds below.
def guess_module_names():
#idea - make score threshold based on the size of the module
# (e.g. smaller modules should have a smaller threshold
global g_module_list
C_SCORE_THRESHOLD = 3
S_SCORE_THRESHOLD = 1
B_SCORE_THRESHOLD = 1
c=0
unk_mod=0
while (c<len(g_module_list)):
m = g_module_list[c]
# first look for strings that start with [FOO], (bracket strings)
# then look for strings that contain source files (.c,.cpp,etc.)
# then try common strings
# above thresholds can be tweaked - they represent the number of strings that have to be repeated
# in order to use that string as the module name
(name,scr) = bracket_strings(m.start,m.end,"[","]")
if (scr < B_SCORE_THRESHOLD):
(name,scr) = source_file_strings(m.start,m.end)
if (scr < S_SCORE_THRESHOLD):
(name,scr) = common_strings(m.start,m.end)
if (scr < C_SCORE_THRESHOLD):
#Couldn't come up with a name so name it umod1, umod2, etc.
name = "umod%d" % (unk_mod)
#"word cloud" or something to get an idea of what the module is
#print basicutils.CompileTextFromRange(m.start,m.end," ")
unk_mod+=1
g_module_list[c].name = name
g_module_list[c].score = scr
print "%08x - %08x : %s (%d)" % (m.start,m.end,name,scr)
c+=1
#print_results():
#Write all of the results to <target>.csv - which can be opened in your favorite spreadsheet program
def print_results():
#Main entry point - returns an LFA module list and a global function list (with the LFA module edges marked)
def analyze():
global g_function_list
c=0
root_name = basicutils.GetInputFile()
file = open(root_name + "_lfa_results.csv", "wb")
#write header
file.write("Function,Function #,Score 1,Score 2,Total,Edge,Function Name,Suggested Module Name\n");
while (c<len(g_function_list)):
f = g_function_list[c]
fname = basicutils.GetFunctionName(f.loc)
m = locate_module(f.loc)
mname = m.name
line = "0x%08x, %d , %f, %f, %f, %d, %s, %s\n" % (f.loc,c+1,f.score1, f.score2, f.total_score,f.edge, fname, mname)
file.write(line)
c+=1
def go():
global g_module_list
#Define range to analyze
#just do .text segment if we've got one
@@ -660,19 +293,5 @@ def go():
func_call_weight(start,end)
#Detect edges - object file boundaries
edge_detect()
#Guess names for the modules using NLP
guess_module_names()
#Output all results as .csv
print_results()
#Output module-to-module call graph as a Graphviz .gv file
gen_mod_graph()
#Output a Python script that will rename modules
gen_rename_script()
#Output .map file (for comparison against ground truth, when available)
gen_map_file()
return True
if __name__ == "__main__":
reload(basicutils)
go()
return g_function_list, g_module_list

maxcut.py (new file, 182 lines)

@@ -0,0 +1,182 @@
##############################################################################################
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
###############################################################
### Object File Boundary Detection in IDA Pro with MaxCut ###
###############################################################
import snap
import sys
import snap_cg
import module
g_maxcut_modlist = []
#make_subgraph()
#returns a Snap subgraph for just the address region specified
#(i.e. the subgraph will not have any edges that originate outside the region
#or terminate outside the region)
def make_subgraph(region_start,region_end, graph):
print "make_subgraph: start: 0x%x and end: 0x%x" % (region_start,region_end)
NIdV = snap.TIntV()
#this would be much faster if we had a linear list of functions (nodes)
for Node in graph.Nodes():
start = Node.GetId()
if (start >= region_start) and (start <= region_end):
NIdV.Add(start)
if (start > region_end):
break
return snap.GetSubGraph(graph, NIdV)
#make_cut()
#This function analyzes the specified region and returns the cut address with the
#maximum score, i.e. the address with the highest average length of the function calls
#that cross it. If multiple addresses have zero crossing calls (an infinite score),
#the one closest to the middle of the region is returned.
def make_cut(region_start, region_end, graph):
print "make_cut: start: 0x%x end: 0x%x" % (region_start,region_end)
weight = {}
z = 0
zeroes = []
for Node in graph.Nodes():
start = Node.GetId()
#iterate only over nodes in this region
cut_address = start - 1
if cut_address < region_start:
continue
weight[cut_address] = 0
edge_count = 0
for Edge in graph.Edges():
edge_start = Edge.GetSrcNId()
edge_end = Edge.GetDstNId()
#only look at edges that cross the possible cut address
#handle both cases for the directed graph
if (edge_start < cut_address and edge_end > cut_address) or (edge_end < cut_address and edge_start > cut_address):
#print " cut %x, %x to %x cross" % (cut_address,edge_start,edge_end)
weight[cut_address] += abs(edge_end - edge_start)
edge_count +=1
#If we have a place where we have no edges crossing - keep track of it
#We will pick the place closest to the center of the module
if edge_count == 0:
print " returning 0 weight count at: 0x%0x" % cut_address
z+=1
zeroes.append(cut_address)
weight[cut_address] = 0
else:
weight[cut_address] = weight[cut_address]/ edge_count
#print "w: %x: %x" % (cut_address, weight[cut_address])
#if we had edges with zero crossings, pick the one closest to the center
if (z > 0):
print " total of %d zero weight counts" % (z)
center = region_start + ((region_end-region_start)/2)
min_dist = sys.maxint
for i in xrange(z):
dist = abs(center - zeroes[i])
if dist < min_dist:
min_dist = dist
min_zero = zeroes[i]
print " returning zero cut at addr: %x" % min_zero
return min_zero
#otherwise pick the edge with the maximum weight score
max_weight=0
#print " weight table:"
for addr,w in weight.iteritems():
#print " %x: %x" % (addr,w)
if w > max_weight:
max_addr = addr
max_weight = w
print " returning max weight: %x at addr: 0x%x" % (max_weight,max_addr)
return max_addr
#do_cutting()
#This is the main recursive algorithm for MaxCut
#Find a cut address, split the graph into two subgraphs, and recurse on those subgraphs
#Stop if the area being cut is below a particular threshold
def do_cutting(start, end, graph):
nodes = graph.GetNodes()
print "do_cutting: start: 0x%x end: 0x%x nodes: 0x%x" % (start, end, nodes)
#set this way for simple
#THRESHOLD = 0x100
#THRESHOLD = 0x1000
THRESHOLD = 0x4000
if (end - start > THRESHOLD) and (nodes > 1):
cut_address = make_cut(start, end,graph)
graph1 = make_subgraph(start,cut_address,graph)
graph2 = make_subgraph(cut_address+1,end,graph)
do_cutting(start,cut_address,graph1)
do_cutting(cut_address+1,end,graph2)
else:
print "Module 0x%x to 0x%x" % (start, end)
b_mod = module.bin_module(start,end,0,"")
g_maxcut_modlist.append(b_mod)
#func_list_annotate()
#This function copies our list of modules into the function list
#This allows us to have a single function list with modules from multiple algorithms (LFA and MaxCut)
def func_list_annotate(flist):
c=0
m=0
while (m < len(g_maxcut_modlist)):
start = g_maxcut_modlist[m].start
while (flist[c].loc < start):
#print "F: %08x M: %08x" % (flist[c].loc, start)
c+=1
if (c == len(flist)):
print "Error: Maxcut module list does not reconcile with function list"
return None
flist[c].edge[1]=1
#print "MC: Set %08x func edge to 1" % flist[c].loc
m+=1
return flist
#Main entry point
#Returns a global function list (annotated with MaxCut edges) and a global module list
def analyze(flist):
sys.setrecursionlimit(5000)
UGraph = snap_cg.create_snap_cg()
g_min_node=sys.maxint
g_max_node=0
for Node in UGraph.Nodes():
id = Node.GetId()
if id < g_min_node:
g_min_node = id
if id > g_max_node:
g_max_node = id
do_cutting(g_min_node,g_max_node, UGraph)
r_flist = func_list_annotate(flist)
return r_flist,g_maxcut_modlist

modnaming.py (new file, 309 lines)

@@ -0,0 +1,309 @@
##############################################################################################
# Copyright 2018 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
IDA_VERSION = 7
if (IDA_VERSION < 7):
import idc
import struct
import idautils
import basicutils_6x as basicutils
else:
import ida_idaapi
import ida_idc
import ida_funcs
import ida_nalt
import ida_segment
import idautils
import basicutils_7x as basicutils
import math
import nltk
import nltk.collocations
import re
### NLP Section ###
# This section of code attempts to name the modules based on common strings in the string references
# Not really based on any sound science or anything - your mileage may heavily vary. :-D
#string_range_tokenize(start,end,sep):
#Compile all string references between start and end as a list of strings (called "tokens")
# <sep> should be a nonsense word, and will show up in the list
def string_range_tokenize(start,end,sep):
# get all string references in this range concatenated into a single string
t = basicutils.CompileTextFromRange(start,end,sep)
#Enable this if you already have a bunch of function names and want to include that in the mix
#t+= basicutils.CompileFuncNamesFromRangeAsText(start,end,sep)
#print "string_range_tokenize: raw text:"
#print t
#remove printf/sprintf format strings
tc = re.sub("%[0-9A-Za-z]+"," ",t)
#convert dash to underscore
tc = re.sub("-","_",tc)
#replace _ and / with space - may want to turn this off sometimes
#this will break up snake case and paths
#problem is that if you have a path that is used throughout the binary it will probably dominate results
tc = re.sub("_"," ",tc)
#replace / and \\ with a space
tc = re.sub("[/\\\\]"," ",tc)
#remove anything except alphanumeric, spaces, . (for .c, .cpp, etc) and _
tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc)
#lowercase it - and store this as the original set of tokens to work with
tokens = [tk.lower() for tk in tc.split()]
#remove English stop words
#this is the list from the MIT *bow project
eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"}
#remove "code" stop words
#e.g. common words in debugging strings
code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"}
stopw = eng_stopw.union(code_sw)
c = 0
tokens_f = []
for t in tokens:
if t not in stopw:
tokens_f.append(t)
return tokens_f
#bracket_strings(start,end,b_brack,e_brack):
#Return the most common string in the range <start,end> that begins with b_brack and ends with e_brack
# The count of how many times this string appeared is also returned
#I find that, somewhat often, people format debug strings like "[MOD_NAME] Function X did Y!"
#This function is called by guess_module_names() - if you see this format with different brackets
#you can edit that call
def bracket_strings(start,end,b_brack,e_brack):
sep = "tzvlw"
t = basicutils.CompileTextFromRange(start,end,sep)
tokens = [tk.lower() for tk in t.split(sep)]
b=[]
for tk in tokens:
tk = tk.strip()
if tk.startswith(b_brack) :
b_contents = tk[1:tk.find(e_brack)]
#Hack to get rid of [-],[+],[*] - could also try to remove non alpha
if (len(b_contents) > 3):
#Hack for debug prints that started with [0x%x]
if (b_contents != "0x%x"):
b.append(tk[1:tk.find(e_brack)])
print "bracket_strings tokens:"
print tokens
print b
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#source_file_strings(start,end):
#Return the most common string that looks like a source file name in the given range
# The count of how many times this string appeared is also returned
def source_file_strings(start,end):
sep = "tzvlw"
t = basicutils.CompileTextFromRange(start,end,sep)
#normally would do lower here to normalize but we lose camel case that way
tokens = [tk for tk in t.split(sep)]
#for each string, remove quotes and commas, then tokenize based on spaces to generate the final list
tokens2=[]
for tk in tokens:
tk = tk.strip()
#strip punctuation, need to leave in _ for filenames and / and \ for paths
tk = re.sub("[\"\'\,]"," ",tk)
for tk2 in tk.split(" "):
tokens2.append(tk2)
b=[]
for tk in tokens2:
tk = tk.strip()
if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"):
#If there's a dir path, only use the end filename
#This could be tweaked if the directory structure is part of the software architecture
#e.g. if there are multiple source directories with meaningful names
if tk.rfind("/") != -1:
ntk = tk[tk.rfind("/")+1:]
elif tk.rfind("\\") != -1:
ntk = tk[tk.rfind("\\")+1:]
else:
ntk = tk
b.append(ntk)
print "source_file_strings tokens:"
#print tokens
print b
#a better way to do this (if there are multiple)
#would be to sort, uniquify, and then make the name foo.c_and_bar.c
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#common_strings(start,end):
#Return a list of the common strings in the given range
#Uses NLTK to generate a list of unigrams, bigrams, and trigrams (1 word, 2 word phrase, 3 word phrase)
#If the trigram score > 1/2 * bigram score, the most common trigram is used
#If the bigram score > 1/2 * unigram score, the most common bigram is used
#Otherwise the most common unigram (single word) is used
def common_strings(start,end):
CS_THRESHOLD = 6
sep = "tvlwz"
tokens = string_range_tokenize(start,end,sep)
#make a copy since we're going to edit it
u_tokens = tokens
c=0
while (c<len(u_tokens)):
if u_tokens[c] == sep:
del u_tokens[c]
else:
c+=1
print "common_strings tokens:"
print tokens
if len(u_tokens) < CS_THRESHOLD:
#print "%08x - %08x : %s" % (start,end,"no string")
return ("",0)
f = nltk.FreqDist(u_tokens)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
#print "Tokens:"
#print tokens
#print len(tokens)
bgs = list(nltk.bigrams(tokens))
c=0
while (c<len(bgs)):
if sep in bgs[c]:
del bgs[c]
else:
c+=1
#print "Bigrams:"
#print bgs
if (len(bgs) != 0):
fs = nltk.FreqDist(bgs)
b_gram = fs.most_common(1)[0][0]
#print "Most Common:"
#print b_gram
b_str = b_gram[0] + "_" + b_gram[1]
b_gram_score = fs.most_common(1)[0][1]
else:
b_str =""
b_gram_score = 0
tgs = list(nltk.trigrams(tokens))
c=0
while (c<len(tgs)):
if sep in tgs[c]:
del tgs[c]
else:
c+=1
#print "Trigrams:"
#print tgs
if (len(tgs) != 0):
ft = nltk.FreqDist(tgs)
t_gram = ft.most_common(1)[0][0]
t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
t_gram_score = ft.most_common(1)[0][1]
else:
t_str = ""
t_gram_score = 0
#print "1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score)
if (b_gram_score * 2 >= u_gram_score):
if (t_gram_score * 2 >= b_gram_score):
ret = t_str
ret_s = t_gram_score
else:
ret = b_str
ret_s = b_gram_score
else:
ret = u_gram
ret_s = u_gram_score
#print "%08x - %08x : %s" % (start,end,ret)
return (ret,ret_s)
### End of NLP Section ###
#guess_module_names():
#Use the NLP section (above) to guess the names of modules and add them to the global module list
#Attempts to find common bracket strings (e.g. "[MOD_NAME] Debug print!")
#then source file names (most often left over from calls to assert())
#then common trigram/bigram/unigrams
#You can tweak the switchover thresholds below.
def guess_module_names(module_list):
#idea - make score threshold based on the size of the module
# (e.g. smaller modules should have a smaller threshold)
C_SCORE_THRESHOLD = 3
S_SCORE_THRESHOLD = 1
B_SCORE_THRESHOLD = 1
c=0
unk_mod=0
while (c<len(module_list)):
m = module_list[c]
# first look for strings that start with [FOO], (bracket strings)
# then look for strings that contain source files (.c,.cpp,etc.)
# then try common strings
# above thresholds can be tweaked - they represent the number of strings that have to be repeated
# in order to use that string as the module name
(name,scr) = bracket_strings(m.start,m.end,"[","]")
if (scr < B_SCORE_THRESHOLD):
(name,scr) = source_file_strings(m.start,m.end)
if (scr < S_SCORE_THRESHOLD):
(name,scr) = common_strings(m.start,m.end)
if (scr < C_SCORE_THRESHOLD):
#Couldn't come up with a name so name it umod1, umod2, etc.
name = "umod%d" % (unk_mod)
#"word cloud" or something to get an idea of what the module is
#print basicutils.CompileTextFromRange(m.start,m.end," ")
unk_mod+=1
module_list[c].name = name
module_list[c].score = scr
print "%08x - %08x : %s (%d)" % (m.start,m.end,name,scr)
c+=1
return module_list

module.py (new file, 37 lines)

@@ -0,0 +1,37 @@
##############################################################################################
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
#This represents the information we want to record about an individual function
#The function lists returned by LFA and MaxCut are made up of these
class func_info():
def __init__(self,loc,score1,score2):
self.loc = loc #the effective address of the function
self.score1=score1 #"Calls from" local function affinity score
self.score2=score2 #"Calls to" local function affinity score
self.total_score=score1+score2
self.lfa_skip=0 #Set to 1 if "skipped" (not scored) by LFA
self.edge=[0,0] #Set by edge_detect() - if 1, this is the start of a new module
#index 0 for LFA, 1 for MaxCut
#This represents the object files (aka modules) identified by LFA and MaxCut
class bin_module():
def __init__(self,start,end,score,name):
self.start=start
self.end=end
self.score=score #Currently unused
self.name=name
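#Illustrative usage (not part of this file) - how CodeCut typically builds these records,
#with made-up addresses and scores:
#finf = func_info(0x401000, 1.5, -0.5)          #total_score becomes 1.0
#finf.edge[0] = 1                               #LFA detected a module edge at this function
#b_mod = bin_module(0x401000, 0x4032f0, 0, "")  #name is filled in later by modnaming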

snap_cg.py (new file, 67 lines)

@@ -0,0 +1,67 @@
##############################################################################################
# Copyright 2019 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.
## This code creates a Snap PNGraph object that represents the call graph of a binary
## (the .text section)
import snap
import sys
import idc
import struct
import idautils
import basicutils_7x as basicutils
MAX_DIST = 0
UGraph = []
def add_edge(f, t):
global UGraph
n = basicutils.GetFunctionName(f)
if n != "":
#since we're only doing one edge for each xref, we'll do weight based on distance from the middle of the caller to the callee
f_start = idc.GetFunctionAttr(f,idc.FUNCATTR_START)
if (not UGraph.IsNode(f_start)):
print "Error: had to add node (to): %08x" % f_start
UGraph.AddNode(f_start)
print "%08x -> %08x" % (f_start, t)
UGraph.AddEdge(t,f_start)
#print "s_%#x -> s_%#x" % (f_start,t)," [len = ",get_weight(func_mid, t), "]"
def add_node(f):
basicutils.ForEveryXrefToD(f, add_edge)
def create_snap_cg():
global UGraph
UGraph= snap.PNGraph.New()
#Add every function linearly; this makes sure the nodes are added in address order
basicutils.ForEveryFuncInSeg(".text",UGraph.AddNode)
basicutils.ForEveryFuncInSeg(".text",add_node)
for NI in UGraph.Nodes():
print "node id 0x%x with out-degree %d and in-degree %d" %(
NI.GetId(), NI.GetOutDeg(), NI.GetInDeg())
return UGraph
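#Illustrative usage from the IDA Python console (assumes an open database with a .text segment):
#import snap_cg
#g = snap_cg.create_snap_cg()
#print "nodes: %d edges: %d" % (g.GetNodes(), g.GetEdges())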