pyghidra changes, not including modnaming refactor

This commit is contained in:
Vivian M
2025-09-26 13:24:50 -04:00
parent bbc19924e3
commit 8d26ba6e0b
11 changed files with 302 additions and 671 deletions

View File

@@ -1,5 +1,5 @@
# @category CodeCut
# @menupath CodeCut.DeepCut (Run)
# @menupath CodeCut.ModNaming (Run)
# @toolbar codecut.png
# @runtime PyGhidra

View File

@@ -43,33 +43,33 @@ def debug_print(x):
#string_range_tokenize(t):
#Take a long string and convert it into a list of tokens. If using a separator, this will appear in the token list
def string_range_tokenize(t):
#print "string_range_tokenize: raw text:"
#print t
#remove printf/sprintf format strings
#tc = re.sub("%[0-9A-Za-z]+"," ",t)
#convert dash to underscore
#tc = re.sub("-","_",tc)
#replace _ and / with space - may want to turn this off sometimes
#this will break up snake case and paths
#problem is that if you have a path that is used throughout the binary it will probably dominate results
#tc = re.sub("_"," ",tc)
#replace / and \\ with a space
#tc = re.sub("[/\\\\]"," ",tc)
#remove anything except alphanumeric, spaces, . (for .c, .cpp, etc) and _
#tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc)
#lowercase it - and store this as the original set of tokens to work with
tokens = [tk.lower() for tk in t.split()]
#remove English stop words
#this is the list from the MIT *bow project
eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"}
#remove "code" stop words
#e.g. common words in debugging strings
code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"}
#remove code stop words (from Joxean Koret's "IDAMagicStrings")
jk_sw = {"copyright", "char", "bool", "int", "unsigned", "long",
#print "string_range_tokenize: raw text:"
#print t
#remove printf/sprintf format strings
#tc = re.sub("%[0-9A-Za-z]+"," ",t)
#convert dash to underscore
#tc = re.sub("-","_",tc)
#replace _ and / with space - may want to turn this off sometimes
#this will break up snake case and paths
#problem is that if you have a path that is used throughout the binary it will probably dominate results
#tc = re.sub("_"," ",tc)
#replace / and \\ with a space
#tc = re.sub("[/\\\\]"," ",tc)
#remove anything except alphanumeric, spaces, . (for .c, .cpp, etc) and _
#tc = re.sub("[^A-Za-z0-9_\.\s]"," ",tc)
#lowercase it - and store this as the original set of tokens to work with
tokens = [tk.lower() for tk in t.split()]
#remove English stop words
#this is the list from the MIT *bow project
eng_stopw = {"about","all","am","an","and","are","as","at","be","been","but","by","can","cannot","did","do","does","doing","done","for","from","had","has","have","having","if","in","is","it","its","of","on","that","the","these","they","this","those","to","too","want","wants","was","what","which","will","with","would"}
#remove "code" stop words
#e.g. common words in debugging strings
code_sw = {"error","err","errlog","log","return","returned","byte","bytes","status","len","length","size","ok","0x","warning","fail","failed","failure","invalid","illegal","param","parameter","done","complete","assert","assertion","cant","didnt","class","foundation","cdecl","stdcall","thiscall"}
#remove code stop words (from Joxean Koret's "IDAMagicStrings")
jk_sw = {"copyright", "char", "bool", "int", "unsigned", "long",
"double", "float", "signed", "license", "version", "cannot", "error",
"invalid", "null", "warning", "general", "argument", "written", "report",
"failed", "assert", "object", "integer", "unknown", "localhost", "native",
@@ -85,18 +85,18 @@ def string_range_tokenize(t):
"corrupted", "default", "success", "expecting", "missing", "phrase",
"unrecognized", "undefined"}
stopw = eng_stopw.union(code_sw)
stopw = stopw.union(jk_sw)
stopw = eng_stopw.union(code_sw)
stopw = stopw.union(jk_sw)
c = 0
tokens_f = []
for t in tokens:
if t not in stopw:
tokens_f.append(t)
return tokens_f
c = 0
tokens_f = []
for t in tokens:
if t not in stopw:
tokens_f.append(t)
return tokens_f
#bracket_strings(t,b_brack,e_brack):
#Return the most common string in the text that begins with b_brack and ends with e_brack
@@ -105,36 +105,36 @@ def string_range_tokenize(t):
#This function is called by guess_module_names() - if you see this format with different brackets
#you can edit that call
def bracket_strings(t, b_brack,e_brack, sep):
#sep = "tzvlw"
#t = basicutils.CompileTextFromRange(start,end,sep)
tokens = [tk.lower() for tk in t.split(sep)]
#don't want to use tokenize here because it removes brackets
b=[]
for tk in tokens:
tk = tk.strip()
if tk.startswith(b_brack) :
b_contents = tk[1:tk.find(e_brack)]
#print("found bracket string, content: %s" % b_contents)
#Hack to get rid of [-],[+],[*] - could also try to remove non alpha
if (len(b_contents) > 3):
#Hack for debug prints that started with [0x%x]
if (b_contents != "0x%x"):
b.append(b_contents)
debug_print("bracket_strings tokens:")
debug_print(tokens)
debug_print(b)
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#sep = "tzvlw"
#t = basicutils.CompileTextFromRange(start,end,sep)
tokens = [tk.lower() for tk in t.split(sep)]
#don't want to use tokenize here because it removes brackets
b=[]
for tk in tokens:
tk = tk.strip()
if tk.startswith(b_brack):
b_contents = tk[1:tk.find(e_brack)] if e_brack in tk else tk[1:]
#print("found bracket string, content: %s" % b_contents)
#Hack to get rid of [-],[+],[*] - could also try to remove non alpha
if (len(b_contents) > 3):
#Hack for debug prints that started with [0x%x]
if (b_contents != "0x%x"):
b.append(b_contents)
debug_print("bracket_strings tokens:")
debug_print(tokens)
debug_print(b)
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#is_source_file_str(f):
#return True if the file string ends with one of the source file extensions
@@ -159,53 +159,53 @@ def is_source_file_str(f):
#Return the most common string that looks like a source file name in the given text string
# The count of how many times this string appeared is also returned
def source_file_strings(t, sep):
#sep = "tzvlw"
#t = basicutils.CompileTextFromRange(start,end,sep)
#normally would do lower here to normalize but we lose camel case that way
tokens = [tk for tk in t.split(sep)]
#for each string, remove quotes and commas, then tokenize based on spaces to generate the final list
tokens2=[]
for tk in tokens:
tk = tk.strip()
#strip punctuation, need to leave in _ for filenames and / and \ for paths
tk = re.sub("[\"\',]"," ",tk)
for tk2 in tk.split(" "):
tokens2.append(tk2)
#sep = "tzvlw"
#t = basicutils.CompileTextFromRange(start,end,sep)
#normally would do lower here to normalize but we lose camel case that way
tokens = [tk for tk in t.split(sep)]
#for each string, remove quotes and commas, then tokenize based on spaces to generate the final list
tokens2=[]
for tk in tokens:
tk = tk.strip()
#strip punctuation, need to leave in _ for filenames and / and \ for paths
tk = re.sub("[\"\',]"," ",tk)
for tk2 in tk.split(" "):
tokens2.append(tk2)
debug_print("source_file_strings tokens2:")
debug_print(tokens2)
debug_print("source_file_strings tokens2:")
debug_print(tokens2)
b=[]
for tk in tokens2:
tk = tk.strip()
if is_source_file_str(tk):
#If there's a dir path, only use the end filename
#This could be tweaked if the directory structure is part of the software architecture
#e.g. if there are multiple source directories with meaningful names
if tk.rfind("/") != -1:
ntk = tk[tk.rfind("/")+1:]
elif tk.rfind("\\") != -1:
ntk = tk[tk.rfind("\\")+1:]
else:
ntk = tk
b.append(ntk)
debug_print("source_file_strings tokens:")
debug_print(tokens)
debug_print(b)
#a better way to do this (if there are multiple)
#would be to sort, uniquify, and then make the name foo.c_and_bar.c
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
b=[]
for tk in tokens2:
tk = tk.strip()
if is_source_file_str(tk):
#If there's a dir path, only use the end filename
#This could be tweaked if the directory structure is part of the software architecture
#e.g. if there are multiple source directories with meaningful names
if tk.rfind("/") != -1:
ntk = tk[tk.rfind("/")+1:]
elif tk.rfind("\\") != -1:
ntk = tk[tk.rfind("\\")+1:]
else:
ntk = tk
b.append(ntk)
debug_print("source_file_strings tokens:")
debug_print(tokens)
debug_print(b)
#a better way to do this (if there are multiple)
#would be to sort, uniquify, and then make the name foo.c_and_bar.c
u_gram=""
u_gram_score=0
if (len(b) > 0):
f = nltk.FreqDist(b)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
return (u_gram,u_gram_score)
#common_strings(t, sep):
#Return a list of the common strings in the string "t" - lines separated by "sep"
#Uses NLTK to generate a list of unigrams, bigrams, and trigrams (1 word, 2 word phrase, 3 word phrase)
@@ -213,90 +213,90 @@ def source_file_strings(t, sep):
#If the bigram score > 1/2 * unigram score, the most common bigram is used
#Otherwise the most common unigram (single word is used)
def common_strings(t,sep):
CS_THRESHOLD = 6
tokens = string_range_tokenize(t)
#make a copy since we're going to edit it
u_tokens = tokens
c=0
while (c<len(u_tokens)):
if u_tokens[c] == sep:
del u_tokens[c]
else:
c+=1
debug_print("common_strings tokens:")
debug_print(tokens)
if len(u_tokens) < CS_THRESHOLD:
#print("less than threshold")
return ("",0)
f = nltk.FreqDist(u_tokens)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
#print "Tokens:"
#print tokens
#print len(tokens)
bgs = list(nltk.bigrams(tokens))
c=0
while (c<len(bgs)):
if sep in bgs[c]:
del bgs[c]
else:
c+=1
debug_print("Bigrams:")
debug_print(bgs)
if (len(bgs) != 0):
fs = nltk.FreqDist(bgs)
b_gram = fs.most_common(1)[0][0]
#print "Most Common:"
#print b_gram
b_str = b_gram[0] + "_" + b_gram[1]
b_gram_score = fs.most_common(1)[0][1]
else:
b_str =""
b_gram_score = 0
tgs = list(nltk.trigrams(tokens))
c=0
while (c<len(tgs)):
if sep in tgs[c]:
del tgs[c]
else:
c+=1
debug_print("Trigrams:")
debug_print(tgs)
if (len(tgs) != 0):
ft = nltk.FreqDist(tgs)
t_gram = ft.most_common(1)[0][0]
t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
t_gram_score = ft.most_common(1)[0][1]
else:
t_str = ""
t_gram_score = 0
debug_print("1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score))
if (b_gram_score > 1) and (b_gram_score * 2 >= u_gram_score):
if (t_gram_score > 1) and (t_gram_score * 2 >= b_gram_score):
ret = t_str
ret_s = t_gram_score
else:
ret = b_str
ret_s = b_gram_score
else:
ret = u_gram
ret_s = u_gram_score
return (ret,ret_s)
CS_THRESHOLD = 6
tokens = string_range_tokenize(t)
#make a copy since we're going to edit it
u_tokens = tokens
c=0
while (c<len(u_tokens)):
if u_tokens[c] == sep:
del u_tokens[c]
else:
c+=1
debug_print("common_strings tokens:")
debug_print(tokens)
if len(u_tokens) < CS_THRESHOLD:
#print("less than threshold")
return ("",0)
f = nltk.FreqDist(u_tokens)
u_gram = f.most_common(1)[0][0]
u_gram_score = f.most_common(1)[0][1]
#print "Tokens:"
#print tokens
#print len(tokens)
bgs = list(nltk.bigrams(tokens))
c=0
while (c<len(bgs)):
if sep in bgs[c]:
del bgs[c]
else:
c+=1
debug_print("Bigrams:")
debug_print(bgs)
if (len(bgs) != 0):
fs = nltk.FreqDist(bgs)
b_gram = fs.most_common(1)[0][0]
#print "Most Common:"
#print b_gram
b_str = b_gram[0] + "_" + b_gram[1]
b_gram_score = fs.most_common(1)[0][1]
else:
b_str =""
b_gram_score = 0
tgs = list(nltk.trigrams(tokens))
c=0
while (c<len(tgs)):
if sep in tgs[c]:
del tgs[c]
else:
c+=1
debug_print("Trigrams:")
debug_print(tgs)
if (len(tgs) != 0):
ft = nltk.FreqDist(tgs)
t_gram = ft.most_common(1)[0][0]
t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
t_gram_score = ft.most_common(1)[0][1]
else:
t_str = ""
t_gram_score = 0
debug_print("1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score))
if (b_gram_score > 1) and (b_gram_score * 2 >= u_gram_score):
if (t_gram_score > 1) and (t_gram_score * 2 >= b_gram_score):
ret = t_str
ret_s = t_gram_score
else:
ret = b_str
ret_s = b_gram_score
else:
ret = u_gram
ret_s = u_gram_score
return (ret,ret_s)
### End of NLP Section ###
### End of NLP Section ###
@@ -308,44 +308,44 @@ def common_strings(t,sep):
#You can tweak the switchover thresholds below.
def guess_module_names(t,sep):
#idea - make score threshold based on the size of the module
# (e.g. smaller modules should have a smaller threshold
C_SCORE_THRESHOLD = 4 #we need to see at least <N> occurrences of a string set in order to pick that name
S_SCORE_THRESHOLD = 2 #if we see <N> occurrences of foo.c we'll pick "foo.c"
B_SCORE_THRESHOLD = 2 #if we see <N> occurrences of [foo] we'll pick "foo"
#idea - make score threshold based on the size of the module
# (e.g. smaller modules should have a smaller threshold
C_SCORE_THRESHOLD = 4 #we need to see at least <N> occurrences of a string set in order to pick that name
S_SCORE_THRESHOLD = 2 #if we see <N> occurrences of foo.c we'll pick "foo.c"
B_SCORE_THRESHOLD = 2 #if we see <N> occurrences of [foo] we'll pick "foo"
# first look for strings that start with [FOO], (bracket strings)
# then look for strings that contain source files (.c,.cpp,etc.)
# then try common strings
# above thresholds can be tweaked - they represent the number of strings that have to be repeated
# in order to use that string as the module name
(name,scr) = bracket_strings(t,"[","]",sep)
debug_print("bracket name: %s score: %d" %(name, scr))
#if (True):
if (scr < B_SCORE_THRESHOLD):
(name,scr) = source_file_strings(t,sep)
debug_print("source name: %s score: %d" % (name, scr))
#if (True):
if (scr < S_SCORE_THRESHOLD):
(name,scr) = common_strings(t,sep)
debug_print("common name: %s score: %d" % (name, scr))
if (scr < C_SCORE_THRESHOLD):
#Couldn't come up with a name
name = "unknown"
# first look for strings that start with [FOO], (bracket strings)
# then look for strings that contain source files (.c,.cpp,etc.)
# then try common strings
# above thresholds can be tweaked - they represent the number of strings that have to be repeated
# in order to use that string as the module name
(name,scr) = bracket_strings(t,"[","]",sep)
debug_print("bracket name: %s score: %d" %(name, scr))
#if (True):
if (scr < B_SCORE_THRESHOLD):
(name,scr) = source_file_strings(t,sep)
debug_print("source name: %s score: %d" % (name, scr))
#if (True):
if (scr < S_SCORE_THRESHOLD):
(name,scr) = common_strings(t,sep)
debug_print("common name: %s score: %d" % (name, scr))
if (scr < C_SCORE_THRESHOLD):
#Couldn't come up with a name
name = "unknown"
return name
return name
def main():
#t=""
sep = "tzvlw"
# java side handles adding sep between strings,
# read all in at once (no newlines between strings)
#t = sys.stdin.readline()
t = input()
#print ("text in: %s" % t)
name = guess_module_names(t,sep)
print(name)
#t=""
sep = "tzvlw"
# java side handles adding sep between strings,
# read all in at once (no newlines between strings)
#t = sys.stdin.readline()
t = input()
#print ("text in: %s" % t)
name = guess_module_names(t,sep)
print(name)
if __name__ == "__main__":
main()
main()

View File

@@ -791,6 +791,31 @@ public class CodeCutGUIPlugin extends ProgramPlugin implements DomainObjectListe
}
/**
 * Wraps the PyGhidra "range.py" script so it can be invoked as a GhidraScript
 * over an address range. The start/end addresses are passed through as script
 * arguments; the optional File identifies an output/working path.
 */
private class ModuleNamerV2 extends GhidraScript {
	// Captured from the active tool at construction time.
	Program program = GhidraProgramUtilities.getCurrentProgram(tool);
	GhidraState state = new GhidraState(tool, tool.getProject(), program, null, null, null);
	String start_addr;
	String end_addr;
	String path;

	public ModuleNamerV2(String start, String end, File file) {
		this.start_addr = start;
		this.end_addr = end;
		// FIX: 'file' was accepted but never stored, leaving the declared
		// 'path' field permanently null. Store its absolute path (null-safe).
		this.path = (file != null) ? file.getAbsolutePath() : null;
	}

	@Override
	public void run() {
		String[] args = { start_addr, end_addr };
		try {
			runScript("range.py", args);
		} catch (Exception e) {
			// NOTE(review): consider routing through Msg.error like the rest of
			// the plugin instead of printing the trace to stderr.
			e.printStackTrace();
		}
	}
}
private void createExportActions() {
//Need Decompiler extensions
/*
@@ -1255,6 +1280,7 @@ public class CodeCutGUIPlugin extends ProgramPlugin implements DomainObjectListe
}
}
private class OFileExporter extends GhidraScript{
GhidraState state;

View File

@@ -146,9 +146,6 @@ public class ModNamingAnalyzer {
return allStrings;
}
private String guessSingleModule(List<String> strList) {
return "";
}
public void guessModuleNames() {
Task guessNamesTask = new Task("Guess Module Names", true, true, true) {
@@ -186,7 +183,7 @@ public class ModNamingAnalyzer {
//if name is "unknown" (e.g. modnaming found no repeated strings) don't bother renaming
if (suggestedName.equals("unknown")) {
Msg.info(this, "No name guess found for module " + ns.getName() + ", leaving unchanged");
break;
continue;
}
suggestedModuleNames.put(ns, suggestedName);
@@ -204,16 +201,18 @@ public class ModNamingAnalyzer {
num++;
}
Namespace newNs = null;
int transactionId = currentProgram.startTransaction("ns");
int transactionId = currentProgram.startTransaction("CreateNamespace");
boolean success = false;
try {
newNs = currentProgram.getSymbolTable().createNameSpace(ns.getParentNamespace(), newName, SourceType.USER_DEFINED);
Msg.info(this, "Created NS with new name " + newName + " for module " + ns.getName());
newNs = currentProgram.getSymbolTable()
.createNameSpace(ns.getParentNamespace(), newName, SourceType.USER_DEFINED);
success = true;
} catch (DuplicateNameException ex) {
Msg.error(this, "Failed to create namespace for suggested name " + suggestedName, ex);
} finally {
currentProgram.endTransaction(transactionId, success);
}
catch (DuplicateNameException ex) {
Msg.error(this, "Failed when trying to find and set name for suggested name " + suggestedName);
currentProgram.endTransaction(transactionId, false);
}
currentProgram.endTransaction(transactionId, true);
try {
CodecutUtils.renameNamespace(currentProgram, ns, newNs);

View File

@@ -36,6 +36,7 @@ import torch
from math import log2, copysign
from networkx import DiGraph
from scipy.linalg import toeplitz
from scipy.sparse import coo_matrix, diags, csr_matrix
import GNN_Net
@@ -106,38 +107,44 @@ class Deepcut:
def _adjacency_matrix(self):
num_funcs = len(self.graph.nodes)
A = np.zeros((num_funcs, num_funcs))
for e, v in zip(self.graph_connectivity, self.predicted_labels):
# Build sparse adjacency from predicted edge scores
rows = []
cols = []
vals = []
for (e, v) in zip(self.graph_connectivity, self.predicted_labels.flatten()):
e0, e1 = e
A[e0, e1] = v
rows.append(e0)
cols.append(e1)
vals.append(float(v))
A += A.T
A *= 0.5
A = coo_matrix((vals, (rows, cols)), shape=(num_funcs, num_funcs))
"""
add a small connection between adjacent nodes,
essentially to break ties in favor of merging communities
"""
x = np.zeros(num_funcs)
x[1] = 0.05
A += toeplitz(x)
# Symmetrize and average: (A + A^T)/2
A = (A + A.T).multiply(0.5).tocsr()
return A
# Add small off-diagonal connection (equivalent to toeplitz([0, 0.05, 0, ...]))
off = diags([0.05 * np.ones(num_funcs - 1), 0.05 * np.ones(num_funcs - 1)],
offsets=[-1, 1], shape=(num_funcs, num_funcs), format='csr')
A = (A + off).tocsr()
return A # CSR sparse matrix
def _modularity(self):
adj_matrix = self._adjacency_matrix()
# node degrees
k = np.sum(adj_matrix, axis=0)
k2 = np.array([k])
B = k2.T @ k2
B /= 2 * np.sum(k2)
Q = adj_matrix - B
A = self._adjacency_matrix() # sparse CSR
# node degrees (as dense 1D array for lightweight vector ops)
k = np.array(A.sum(axis=0)).ravel()
two_m = 2.0 * k.sum() # denominator used in modularity B term
def compute_partial_modularity(start, stop):
return np.sum(Q[start:stop, start:stop])
# Sum of A over the block [start:stop, start:stop]
A_block_sum = A[start:stop, start:stop].sum()
# Sum of degrees in the block
k_block_sum = k[start:stop].sum()
# Sum of B over the block: (sum_k_block)^2 / (2m)
B_block_sum = (k_block_sum * k_block_sum) / two_m if two_m > 0 else 0.0
# Return sum(Q_block) = sum(A_block) - sum(B_block)
return float(A_block_sum) - float(B_block_sum)
scores = [0.0]
scores = np.array(scores)
@@ -148,15 +155,15 @@ class Deepcut:
for index in range(1, len(self.graph.nodes)):
update = [compute_partial_modularity(i, index) for i in
range(max(0, index-max_cluster_size), index)]
range(max(0, index - max_cluster_size), index)]
if index > max_cluster_size:
update = [0]*(index-max_cluster_size) + update
update = [0] * (index - max_cluster_size) + update
updated_scores = scores + update
i = np.argmax(updated_scores)
if index > max_cluster_size:
i = np.argmax(updated_scores[index-max_cluster_size:])+ (index - max_cluster_size)
i = np.argmax(updated_scores[index - max_cluster_size:]) + (index - max_cluster_size)
s = updated_scores[i]
c = cuts[i] + [index]

View File

@@ -1,5 +0,0 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-6.6.1-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

185
deepcut-ghidra/gradlew vendored
View File

@@ -1,185 +0,0 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

View File

@@ -1,89 +0,0 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega

View File

@@ -1,57 +0,0 @@
<?xml version='1.0' encoding='ISO-8859-1' ?>
<!--
This is an XML file intended to be parsed by the Ghidra help system. It is loosely based
upon the JavaHelp table of contents document format. The Ghidra help system uses a
TOC_Source.xml file to allow a module with help to define how its contents appear in the
Ghidra help viewer's table of contents. The main document (in the Base module)
defines a basic structure for the
Ghidra table of contents system. Other TOC_Source.xml files may use this structure to insert
their files directly into this structure (and optionally define a substructure).
In this document, a tag can be either a <tocdef> or a <tocref>. The former is a definition
of an XML item that may have a link and may contain other <tocdef> and <tocref> children.
<tocdef> items may be referred to in other documents by using a <tocref> tag with the
appropriate id attribute value. Using these two tags allows any module to define a place
in the table of contents system (<tocdef>), which also provides a place for
other TOC_Source.xml files to insert content (<tocref>).
During the help build time, all TOC_Source.xml files will be parsed and validated to ensure
that all <tocref> tags point to valid <tocdef> tags. From these files will be generated
<module name>_TOC.xml files, which are table of contents files written in the format
desired by the JavaHelp system. Additionally, the genated files will be merged together
as they are loaded by the JavaHelp system. In the end, when displaying help in the Ghidra
help GUI, there will be one table of contents that has been created from the definitions in
all of the modules' TOC_Source.xml files.
Tags and Attributes
<tocdef>
-id - the name of the definition (this must be unique across all TOC_Source.xml files)
-text - the display text of the node, as seen in the help GUI
-target** - the file to display when the node is clicked in the GUI
-sortgroup - this is a string that defines where a given node should appear under a given
parent. The string values will be sorted by the JavaHelp system using
a javax.text.RulesBasedCollator. If this attribute is not specified, then
the text of attribute will be used.
<tocref>
-id - The id of the <tocdef> that this reference points to
**The URL for the target is relative and should start with 'help/topics'. This text is
used by the Ghidra help system to provide a universal starting point for all links so that
they can be resolved at runtime, across modules.
-->
<tocroot>
<!-- Uncomment and adjust fields to add help topic to help system's Table of Contents
<tocref id="Ghidra Functionality">
<tocdef id="HelpAnchor" text="My Feature" target="help/topics/my_topic/help.html" />
</tocref>
-->
</tocroot>

View File

@@ -1,64 +0,0 @@
/* ###
* IP: GHIDRA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
WARNING!
This file is copied to all help directories. If you change this file, you must copy it
to each src/main/help/help/shared directory.
Java Help Note: JavaHelp does not accept sizes (like in 'margin-top') in anything but
px (pixel) or with no type marking.
*/
body { margin-bottom: 50px; margin-left: 10px; margin-right: 10px; margin-top: 10px; } /* some padding to improve readability */
li { font-family:times new roman; font-size:14pt; }
/* Heading hierarchy: h1 is the centered page title; h2-h4 are section headers of decreasing emphasis */
h1 { color:#000080; font-family:times new roman; font-size:36pt; font-style:italic; font-weight:bold; text-align:center; }
h2 { margin: 10px; margin-top: 20px; color:#984c4c; font-family:times new roman; font-size:18pt; font-weight:bold; }
h3 { margin-left: 10px; margin-top: 20px; color:#0000ff; font-family:times new roman; font-size:14pt; font-weight:bold; }
h4 { margin-left: 10px; margin-top: 20px; font-family:times new roman; font-size:14pt; font-style:italic; }
/*
P tag code. Most of the help files nest P tags inside of blockquote tags (that was the
way it had been done in the beginning). The net effect is that the text is indented. In
modern HTML we would use CSS to do this. We need to support the Ghidra P tags, nested in
blockquote tags, as well as naked P tags. The following two lines accomplish this. Note
that the 'blockquote p' definition will inherit from the first 'p' definition.
*/
p { margin-left: 40px; font-family:times new roman; font-size:14pt; }
blockquote p { margin-left: 10px; }
/* Both capitalizations are kept so existing help pages using either class name continue to match */
p.providedbyplugin { color:#7f7f7f; margin-left: 10px; font-size:14pt; margin-top:100px }
p.ProvidedByPlugin { color:#7f7f7f; margin-left: 10px; font-size:14pt; margin-top:100px }
p.relatedtopic { color:#800080; margin-left: 10px; font-size:14pt; }
p.RelatedTopic { color:#800080; margin-left: 10px; font-size:14pt; }
/*
We wish for tables to have space between them and the preceding element, so that text
is not too close to the top of the table. Also, nest the table a bit so that it is clear
the table relates to the preceding text.
*/
table { margin-left: 20px; margin-top: 10px; width: 80%;}
td { font-family:times new roman; font-size:14pt; vertical-align: top; }
th { font-family:times new roman; font-size:14pt; font-weight:bold; background-color: #EDF3FE; }
/*
Code-like formatting for things such as file system paths and proper names of classes,
methods, etc. To apply this to a file path, use this syntax:
<CODE CLASS="path">...</CODE>
*/
code { color: black; font-weight: bold; font-family: courier new, monospace; font-size: 14pt; white-space: nowrap; }
code.path { color: #4682B4; font-weight: bold; font-family: courier new, monospace; font-size: 14pt; white-space: nowrap; }

View File

@@ -30,13 +30,11 @@ import java.io.FileNotFoundException;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import generic.jar.ResourceFile;
import ghidra.app.script.GhidraScriptLoadException;
import ghidra.app.services.AbstractAnalyzer;
import ghidra.app.services.AnalysisPriority;
import ghidra.app.services.AnalyzerType;
import ghidra.app.util.importer.MessageLog;
import ghidra.framework.Application;
import ghidra.framework.options.Options;
import ghidra.program.model.address.Address;
import ghidra.program.model.address.AddressFactory;
@@ -153,6 +151,7 @@ public class DeepCutAnalyzer extends AbstractAnalyzer {
private void addNamespace(Program program, String name, Function function)
throws DuplicateNameException, InvalidInputException, CircularDependencyException {
SymbolTable symbolTable = program.getSymbolTable();
Namespace ns = symbolTable.getNamespace(name, null);
if (ns == null) {