CodeCut/lfa.py

##############################################################################################
# Copyright 2018 The Johns Hopkins University Applied Physics Laboratory LLC
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# HAVE A NICE DAY.

################################################################################
###  Object File Boundary Detection in IDA Pro with Local Function Affinity  ###
################################################################################

# LFA Metric
# Local Function Affinity (LFA) is a measurement of the direction a function
# is being "pulled" by the functions it calls and the functions that call it.
# By looking at an average of the log of the distance between these functions
# we get a measurement of whether the function is related to functions in the
# positive or negative direction.

# Edge Detection
# In a standard C/C++ development environment, the project is divided into
# multiple source files, which are compiled to object files, then linked into
# the final binary in order.  If external references are eliminated (LFA does
# this imperfectly by just eliminating calls whose distance is above a chosen
# threshold) we would expect to see LFA starting positive, switching to
# negative over the course of a source file, then switching back to positive
# at the beginning of the next file.  So object file boundaries

# What is code anyway?
# Don't get too hung up on "object file boundaries" - for LFA (or any other
# attempt to solve the problem) to be perfect, the design and implementation
# of the code would have to be perfect.  What LFA is really finding is clusters
# of functionality, that should be more or less related to object files
# but it will often break up large object files into multiple clusters or
# detect 2 or 3 related object files as one file.

IDA_VERSION = 7
import basicutils_7x as basicutils

#External dependencies
import math

#CodeCut dependencies
import cc_base
import module

#Threshold above which a function call is considered "external"
#For published research - 0x1000 = 4K
MAX_CALL = 0x1000

#This is a list of the LFA scores for all functions
g_function_list = []

#This is a list of modules a.k.a. object files after the edge_detect()
#function is executed
g_module_list = []


#func_callers_weight(f):
#Return the LFA score for functions that this functions calls (i.e. the "calls from" score)
#If there are no references, return 0
def func_callers_weight(f):
	fc = 0
	fs = 0
	for xref in basicutils.FuncXrefsFrom(f):
		dist = abs(xref - f)
		#print "%08x:  %08x %d " % (f, xref, dist),
		if dist > MAX_CALL:
			continue
		if (dist != 0):
			logdist = math.log(dist)
		else: #recursive function call
			logdist = 0
		if (xref - f < 0):
			o = -logdist
		else:
			o = logdist
			#print " %f " % o,
		fs += o
		fc += 1

	if fc == 0:
		score = 0
	else:
		score = fs / fc
	return score

#func_callee_weight(f):
#Return the LFA score for calls where this function is the "callee" (i.e. the "calls to" score)
#If there are no references, return 0
def func_callee_weight(f):
	fc = 0
	fs = 0
	a = 0
	for xref in basicutils.CodeRefsTo(f):

		dist = abs(xref - f)
		#print "%08x:  %08x %d " % (f, xref, dist),
		if dist > MAX_CALL:
			continue
		if (dist != 0):
			logdist = math.log(dist)
		else: #recursive function call
			logdist = 0
		if (xref - f < 0):
			o = -logdist
		else:
			o = logdist
			#print " %f " % o,
		fs += o
		fc += 1


	if fc == 0:
		score = 0
	else:
		score = fs / fc
	return score

#func_call_weight(start,end):
#Iterate over each function in the range and calculated the LFA scores
# If both scores are 0, skip the function altogether, exclude it from the list
# If one score is 0, interpolate that score from the previous score
def func_call_weight(f_start, f_end):
	global g_function_list

	c = 1
	f = f_start
	fe = f_end

	if f==0:
		f = basicutils.NextFunction(0)
		f_end = basicutils.BADADDR

	prevscore = 0
	prevscore_1 = 0
	prevscore_2 = 0
	z1 = 0
	z2 = 0

	#for each function in range
	while (f < fe):

		#get both LFA scores for the function
		score_1 = func_callers_weight(f)
		score_2 = func_callee_weight(f)

		#if both scores are 0 (i.e. no references for the function or all refs are above the threshold)
		#then skip the function altogether
		if (score_1 == 0) and (score_2 == 0):
			print "Skipping 0x%08x\n" % f
			prevscore_1 = 0
			prevscore_2 = 0
			z1 = 1
			z2 = 1
			finf = module.func_info(f,0,0)
			finf.lfa_skip=1
			g_function_list.append(finf)
			f = basicutils.NextFunction(f)
			continue

		#if 1st or 2nd score is zero, interpolate using previous score and an assumed negative linear slope
		#otherwise use the score
		if (score_1 == 0):
			score_1 = prevscore_1 - z1 * .4
			z1 += 1
		else:
			prevscore_1 = score_1
			z1 = 1
		if (score_2 == 0):
			score_2 = prevscore_2 - z2 * .4
			z2 += 1
		else:
			prevscore_2 = score_2
			z2 = 1

		total_score = score_1 + score_2

		#Output scores in log window
		print "0x%08x, %d , %f, %f, %f" % (f, c,score_1, score_2, total_score)

		#Add scores to the global function score list
		finf = module.func_info(f,score_1,score_2)
		finf.lfa_skip=0
		g_function_list.append(finf)

		line = "0x%08x, %d , %f, %f, %f\n" % (f,c,score_1, score_2, total_score)
		f=basicutils.NextFunction(f)
		c+=1

#get_last _three and get_lfa_start:
#Previously LFA would just skip functions if they had no caller or callee score
#it would effectively drop them.  This meant that when doing edge detection we
#knew every function in the function list had a score.  Now we're putting all
#functions in the function list, and we have a "skip" field if LFA should skip it
#for scoring purposes.  So these functions help parse that skip field, since for
#edge detection we look at the previous three scores.
def get_last_three(index):
	c=0
	i = index-1
	p=[]
	while ((c<3) and (i>0)):
		print "get_last_3: %d,%d" % (c,i)
		if (g_function_list[i].lfa_skip == 0):
			p.append(g_function_list[i])
			c+=1
		i-=1
	if (c==3):
		return p[0],p[1],p[2]
	else:
		print "Error: could not find 3 scored entries before index: %d  (%d,%d)" % (index, i, c)
		return 0,0,0

def get_lfa_start():
	c=0;
	i=0;
	while (c < 4):
		print "get_lfa_start: %d,%d" % (c,i)
		if (g_function_list[i].lfa_skip==0):
			c+=1
		i+=1
	return i

#edge_detect():
# Determine boundaries between object files
#  Edge condition is a delta of at least 2 where the current score is positive
#      and 2 of the last 3 scores were negative (negative trend)
def edge_detect():
	global g_function_list
	global g_module_list

	#For published research
	EDGE_THRESHOLD = 2

	c=get_lfa_start()
	#do edge detection
	while (c<len(g_function_list)):
		#TODO: this is not working as previously intended
		#because the last 3 can have "skipped" entries in them
		if (g_function_list[c].lfa_skip == 0):
			f_1,f_2,f_3 = get_last_three(c)
			p_1 = f_1.total_score
			p_2 = f_2.total_score
			p_3 = f_3.total_score
			#p_1 = g_function_list[c-1].total_score
			#p_2 = g_function_list[c-2].total_score
			#p_3 = g_function_list[c-3].total_score
			s = g_function_list[c].total_score
			#if score is positive and it is diff of at least 2 from previous
			#and the previous function was not an edge
			if ((not f_1.edge[0] == 1) and (s > 0) and ((s - p_1) > EDGE_THRESHOLD)):
				#if 2 of last 3 were negative
				m = sorted([p_1,p_2,p_3])
				if (m[1] < 0):
					g_function_list[c].edge[0]=1
		c+=1
	#assign modules based on where the edges are
	c=0
	mod_start = g_function_list[0].loc
	while(c<len(g_function_list)):
		f = g_function_list[c]
		if (f.edge[0] == 1):
			p = g_function_list[c-1]
			b_mod = module.bin_module(mod_start,p.loc,0,"")
			mod_start = f.loc #set the start of the next module to this function (where edge was detected)
			g_module_list.append(b_mod)
		c+=1

#Main entry point - returns an LFA module list and a global function list (with the LFA module edges marked)
def analyze():
	global g_function_list
	global g_module_list

	#Define range to analyze
	#just do .text segment if we've got one
	#otherwise just start from the first function in DB
	start,end = basicutils.SegByName(".text")
	if (start == basicutils.BADADDR):
		start = basicutils.NextFunction(0)
		end = basicutils.BADADDR

	#Calculate LFA score for all functions
	func_call_weight(start,end)
	#Detect edges - object file boundaries
	edge_detect()

	return g_function_list, g_module_list