Skip to content

Commit

Permalink
Add 2 new prompts for finding vulns and recovering man pages (#57)
Browse files Browse the repository at this point in the history
* Add 2 new prompts for finding vulns and recovering man pages

* update readme

* put back

* remove debug

* Fix version

* Hide things
  • Loading branch information
mahaloz authored Sep 17, 2024
1 parent f56f773 commit cb76283
Show file tree
Hide file tree
Showing 8 changed files with 584 additions and 28 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ Currently, DAILA supports the following prompts:
- Rename variables
- Rename function
- Identify the source of a function
- Find potential vulnerabilities in a function
- Summarize the man page of a library call

### VarBERT
VarBERT is a local BERT model from the S&P 2024 paper [""Len or index or count, anything but v1": Predicting Variable Names in Decompilation Output with Transfer Learning"]().
Expand Down
10 changes: 5 additions & 5 deletions dailalib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "3.7.1"
__version__ = "3.8.0"

from .api import AIAPI, LiteLLMAIAPI
from libbs.api import DecompilerInterface
Expand All @@ -13,13 +13,13 @@ def create_plugin(*args, **kwargs):
litellm_api = LiteLLMAIAPI(delay_init=True)
# create context menus for prompts
gui_ctx_menu_actions = {
f"DAILA/LLM/{prompt_name}": (prompt.desc, lambda *x, **y: getattr(litellm_api, prompt_name)(*x, **y))
f"DAILA/LLM/{prompt_name}": (prompt.desc, getattr(litellm_api, prompt_name))
for prompt_name, prompt in litellm_api.prompts_by_name.items()
}
# create context menus for others
gui_ctx_menu_actions["DAILA/LLM/update_api_key"] = ("Update API key...", litellm_api.ask_api_key)
gui_ctx_menu_actions["DAILA/LLM/update_pmpt_style"] = ("Change prompt style...", litellm_api.ask_prompt_style)
gui_ctx_menu_actions["DAILA/LLM/update_model"] = ("Change model...", litellm_api.ask_model)
gui_ctx_menu_actions["DAILA/LLM/Settings/update_api_key"] = ("Update API key...", litellm_api.ask_api_key)
gui_ctx_menu_actions["DAILA/LLM/Settings/update_pmpt_style"] = ("Change prompt style...", litellm_api.ask_prompt_style)
gui_ctx_menu_actions["DAILA/LLM/Settings/update_model"] = ("Change model...", litellm_api.ask_model)

#
# VarModel API (local variable renaming)
Expand Down
11 changes: 10 additions & 1 deletion dailalib/api/ai_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ def _requires_function(*args, ai_api: "AIAPI" = None, **kwargs):
dec_text = kwargs.pop("dec_text", None)
use_dec = kwargs.pop("use_dec", True)
has_self = kwargs.pop("has_self", True)
number_lines = kwargs.pop("number_lines", False)
context = kwargs.pop("context", None)
# make the self object the new AI API, should only be used inside an AIAPI class
if not ai_api and has_self:
ai_api = args[0]
Expand All @@ -95,7 +97,9 @@ def _requires_function(*args, ai_api: "AIAPI" = None, **kwargs):

# we must have a UI if we have no func
if function is None:
function = ai_api._dec_interface.functions[ai_api._dec_interface.gui_active_context().func_addr]
if context is None:
context = ai_api._dec_interface.gui_active_context()
function = ai_api._dec_interface.functions[context.func_addr]

# get new text with the function that is present
if dec_text is None:
Expand All @@ -105,6 +109,11 @@ def _requires_function(*args, ai_api: "AIAPI" = None, **kwargs):

dec_text = decompilation.text

if number_lines:
# put a number in front of each line
dec_lines = dec_text.split("\n")
dec_text = "\n".join([f"{i + 1} {line}" for i, line in enumerate(dec_lines)])

return f(*args, function=function, dec_text=dec_text, use_dec=use_dec, **kwargs)

return _requires_function
Expand Down
36 changes: 29 additions & 7 deletions dailalib/api/litellm/prompts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,43 +7,53 @@ class PromptNames:
RENAME_VARS = "RENAME_VARIABLES"
SUMMARIZE_FUNC = "SUMMARIZE_FUNCTION"
ID_SRC = "IDENTIFY_SOURCE"
FIND_VULN = "FIND_VULN"
MAN_PAGE = "MAN_PAGE"


def get_prompt_template(prompt_name, prompt_style):
if prompt_style in [PromptType.FEW_SHOT, PromptType.ZERO_SHOT]:
from .few_shot_prompts import RENAME_FUNCTION, RENAME_VARIABLES, SUMMARIZE_FUNCTION, IDENTIFY_SOURCE
from .few_shot_prompts import (
RENAME_FUNCTION, RENAME_VARIABLES, SUMMARIZE_FUNCTION, IDENTIFY_SOURCE, FIND_VULN, MAN_PAGE
)
d = {
PromptNames.RENAME_FUNC: RENAME_FUNCTION,
PromptNames.RENAME_VARS: RENAME_VARIABLES,
PromptNames.SUMMARIZE_FUNC: SUMMARIZE_FUNCTION,
PromptNames.ID_SRC: IDENTIFY_SOURCE
PromptNames.ID_SRC: IDENTIFY_SOURCE,
PromptNames.FIND_VULN: FIND_VULN,
PromptNames.MAN_PAGE: MAN_PAGE,
}
elif prompt_style == PromptType.COT:
from .cot_prompts import RENAME_FUNCTION, RENAME_VARIABLES, SUMMARIZE_FUNCTION, IDENTIFY_SOURCE
from .cot_prompts import (
RENAME_FUNCTION, RENAME_VARIABLES, SUMMARIZE_FUNCTION, IDENTIFY_SOURCE, FIND_VULN, MAN_PAGE
)
d = {
PromptNames.RENAME_FUNC: RENAME_FUNCTION,
PromptNames.RENAME_VARS: RENAME_VARIABLES,
PromptNames.SUMMARIZE_FUNC: SUMMARIZE_FUNCTION,
PromptNames.ID_SRC: IDENTIFY_SOURCE
PromptNames.ID_SRC: IDENTIFY_SOURCE,
PromptNames.FIND_VULN: FIND_VULN,
PromptNames.MAN_PAGE: MAN_PAGE,
}
else:
raise ValueError("Invalid prompt style")

return d[prompt_name]
return d.get(prompt_name, None)


PROMPTS = [
Prompt(
"summarize",
PromptNames.SUMMARIZE_FUNC,
desc="Summarize the function",
desc="Summarize this function",
response_key="summary",
gui_result_callback=Prompt.comment_function
),
Prompt(
"identify_source",
PromptNames.ID_SRC,
desc="Identify the source of the function",
desc="Identify the source of this function",
response_key="link",
gui_result_callback=Prompt.comment_function
),
Expand All @@ -59,4 +69,16 @@ def get_prompt_template(prompt_name, prompt_style):
desc="Suggest a function name",
gui_result_callback=Prompt.rename_function
),
Prompt(
"find_vulnerabilities",
PromptNames.FIND_VULN,
desc="Find vulnerabilities in this function",
gui_result_callback=Prompt.comment_vulnerability
),
Prompt(
"man_page",
PromptNames.MAN_PAGE,
desc="Summarize library call man page",
gui_result_callback=Prompt.comment_man_page
),
]
230 changes: 230 additions & 0 deletions dailalib/api/litellm/prompts/cot_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,3 +309,233 @@
You respond with:
"""


# Chain-of-thought prompt template for vulnerability identification.
#
# The first segment is an f-string so the shared COT_PREAMBLE / COT_MIDAMBLE /
# COT_POSTAMBLE fragments are interpolated. The second segment is a plain
# string on purpose: the JSON braces in the worked example and the
# `{{ decompilation }}` placeholder must stay literal.
# NOTE(review): the placeholder is presumably filled in by a template engine
# elsewhere in the package -- confirm against the Prompt class.
# NOTE(review): the worked example uses line-numbered decompilation, so callers
# presumably request numbered lines when querying this prompt -- confirm.
#
# The example answer must be valid JSON (note the comma between the
# "vulnerabilities" and "description" members), since the model mirrors the
# example's format in its own response.
FIND_VULN = f"""
{COT_PREAMBLE}
All experts will be asked to identify vulnerabilities or bugs in code. When given code, you identify
vulnerabilities and specify the type of vulnerability. Only identify the MOST important vulnerabilities in the code.
Ignore bugs like resource leaks.
{COT_MIDAMBLE}
The question is how to identify vulnerabilities in the code given all the information we got.
Note that the vulnerabilities must be specific and include the line numbers where they occur. If you are unsure
of the vulnerability, please do not guess.
{COT_POSTAMBLE}
""" + """
# Example
Here is an example. Given the following code:
```
1 int __fastcall __noreturn main(int argc, const char **argv, const char **envp)
2 {
3 Human *v3; // rbx
4 __int64 v4; // rdx
5 Human *v5; // rbx
6 int v6; // eax
7 __int64 v7; // rax
8 Human *v8; // rbx
9 Human *v9; // rbx
10 char v10[16]; // [rsp+10h] [rbp-50h] BYREF
11 char v11[8]; // [rsp+20h] [rbp-40h] BYREF
12 Human *v12; // [rsp+28h] [rbp-38h]
13 Human *v13; // [rsp+30h] [rbp-30h]
14 size_t nbytes; // [rsp+38h] [rbp-28h]
15 void *buf; // [rsp+40h] [rbp-20h]
16 int v16; // [rsp+48h] [rbp-18h] BYREF
17 char v17; // [rsp+4Eh] [rbp-12h] BYREF
18 char v18[17]; // [rsp+4Fh] [rbp-11h] BYREF
19
20 std::allocator<char>::allocator(&v17, argv, envp);
21 std::string::string(v10, "Jack", &v17);
22 v3 = (Human *)operator new(0x18uLL);
23 Man::Man(v3, v10, 25LL);
24 v12 = v3;
25 std::string::~string((std::string *)v10);
26 std::allocator<char>::~allocator(&v17);
27 std::allocator<char>::allocator(v18, v10, v4);
28 std::string::string(v11, "Jill", v18);
29 v5 = (Human *)operator new(0x18uLL);
30 Woman::Woman(v5, v11, 21LL);
31 v13 = v5;
32 std::string::~string((std::string *)v11);
33 std::allocator<char>::~allocator(v18);
34 while ( 1 )
35 {
36 while ( 1 )
37 {
38 while ( 1 )
39 {
40 std::operator<<<std::char_traits<char>>(&std::cout, "1. use
41 2. after
42 3. free
43 ");
44 std::istream::operator>>(&std::cin, &v16);
45 if ( v16 != 2 )
46 break;
47 nbytes = atoi(argv[1]);
48 buf = (void *)operator new[](nbytes);
49 v6 = open(argv[2], 0);
50 read(v6, buf, nbytes);
51 v7 = std::operator<<<std::char_traits<char>>(&std::cout, "your data is allocated");
52 std::ostream::operator<<(v7, &std::endl<char,std::char_traits<char>>);
53 }
54 if ( v16 == 3 )
55 break;
56 if ( v16 == 1 )
57 {
58 (*(void (__fastcall **)(Human *))(*(_QWORD *)v12 + 8LL))(v12);
59 (*(void (__fastcall **)(Human *))(*(_QWORD *)v13 + 8LL))(v13);
60 }
61 }
62 v8 = v12;
63 if ( v12 )
64 {
65 Human::~Human(v12);
66 operator delete(v8);
67 }
68 v9 = v13;
69 if ( v13 )
70 {
71 Human::~Human(v13);
72 operator delete(v9);
73 }
74 }
75 }
```
You would respond with:
## Reasoning
### Expert 1: C/C++ Programming Expert
**Assessment**: The first vulnerability I notice is a potential use-after-free. Specifically, in lines 62-73, there are
deletions of the `v12` and `v13` objects. If the program re-enters this loop and tries to access these pointers without
proper reallocation, it will result in undefined behavior due to accessing freed memory. Additionally, lines 47-50 have
potential for buffer overflow. The size `nbytes` from `argv[1]` is used directly without any checks. If `argv[1]` is a
very large value, it can cause excessive allocation or even wrap around to a small value, potentially leading to an
overflow when reading data.
### Expert 2: Reverse Engineering Expert
**Assessment**: One main issue is the use-after-free vulnerability in lines 62-73. Freeing `v12` and `v13` and then
potentially accessing them in subsequent iterations is problematic. This vulnerability can be exploited to crash the
program or execute arbitrary code. The second notable vulnerability is the insecure handling of `nbytes` in lines
47-50. Without validation, there's a risk that this unbounded value could lead to buffer overflow or memory corruption,
especially if `argv[1]` holds a negative or excessively large number.
### Expert 3: Cybersecurity Analyst
**Assessment**: The use-after-free in lines 62-73 stands out as particularly severe. If the pointers `v12` and `v13`
are accessed after being freed, it can lead to security breaches or application crashes. Another critical point is the
lack of validation for `nbytes` in lines 47-50, which can potentially cause buffer overflow. This lack of sanitization
makes the application prone to memory corruption, which can be a severe security issue and possibly exploitable.
## Answer
{
"vulnerabilities": ["use-after-free (62-73)", "buffer-overflow (47-50)"],
"description": "The code contains a classic use-after-free vulnerability. In lines 62-73, the pointers v12 and v13 (which point to objects of type Human) are deleted (freed) using operator delete. If the program's loop (lines 34-74) executes again and the pointers v12 or v13 are accessed without reallocation, it results in undefined behavior due to use-after-free. In lines 47-50, the code reads a size value from argv[1] and uses it directly with operator new[] to allocate a buffer (buf). There are no checks to ensure that nbytes is a reasonable size, potentially leading to a large allocation or integer overflow."
}
# Example
Given the following code:
```
{{ decompilation }}
```
You respond with:
"""

# Chain-of-thought prompt template for producing a summarized man page of the
# function call found on a focal line of decompiled code.
#
# The first segment is an f-string so the shared COT_PREAMBLE / COT_MIDAMBLE /
# COT_POSTAMBLE fragments are interpolated. The second segment is a plain
# string, which keeps the JSON braces in the worked example and the
# `{{ decompilation }}` / `{{ line_text }}` placeholders literal.
# NOTE(review): the placeholders are presumably filled in by a template engine
# elsewhere in the package -- confirm against the Prompt class.
# NOTE(review): the three expert sections in the worked example are headings
# only, with no assessment text -- confirm this is intentional.
MAN_PAGE = f"""
{COT_PREAMBLE}
All experts will be asked to write a summarized man page for a function in a decompiled C code.
These summaries should include arg and return information as well as types.
The focal point will be on a function call (that is a library) inside this function.
{COT_MIDAMBLE}
The question is how to write a summarized man page for the target function given all the information we got.
A focal line will be given to do analysis on.
{COT_POSTAMBLE}
""" + """
# Example
Here is an example, given the following code as context:
```
void __fastcall gz_error(__int64 a1, int a2, const char *a3)
{
void *v5; // rcx
__int64 v7; // rbx
__int64 v8; // rax
__int64 v9; // rcx
char *v10; // rax
char *v11; // rcx
const char *v12; // r9
__int64 v13; // rax
v5 = *(void **)(a1 + 120);
if ( v5 )
{
if ( *(_DWORD *)(a1 + 116) != -4 )
free(v5);
*(_QWORD *)(a1 + 120) = 0LL;
}
if ( a2 && a2 != -5 )
*(_DWORD *)a1 = 0;
*(_DWORD *)(a1 + 116) = a2;
if ( a3 && a2 != -4 )
{
v7 = -1LL;
v8 = -1LL;
do
++v8;
while ( *(_BYTE *)(*(_QWORD *)(a1 + 32) + v8) );
v9 = -1LL;
do
++v9;
while ( a3[v9] );
v10 = (char *)malloc(v8 + 3 + v9);
*(_QWORD *)(a1 + 120) = v10;
v11 = v10;
if ( v10 )
{
v12 = *(const char **)(a1 + 32);
v13 = -1LL;
while ( v12[++v13] != 0 )
;
do
++v7;
while ( a3[v7] );
snprintf(v11, v7 + v13 + 3, "%s%s%s", v12, ": ", a3);
}
else
{
*(_DWORD *)(a1 + 116) = -4;
}
}
}
```
You focus on the line in the above text:
```
snprintf(v11, v7 + v13 + 3, "%s%s%s", v12, ": ", a3);
```
Focusing on the outermost function call in this line, you respond with:
## Reasoning
### Expert 1: C Programming Expert
### Expert 2: Reverse Engineering Expert
### Expert 3: Cybersecurity Analyst
## Answer
{
"function": "snprintf",
"args": ["str (char *)", "size (size_t)", "format (const char *)", "..."],
"return": "int",
"description": "The snprintf() function formats and stores a series of characters and values in the array buffer. It is similar to printf(), but with two major differences: it outputs to a buffer rather than stdout, and it takes an additional size parameter specifying the limit of characters to write. The size parameter prevents buffer overflows. It returns the number of characters that would have been written if the buffer was sufficiently large, not counting the terminating null character."
}
# Example
Given the following code as context:
```
{{ decompilation }}
```
You focus on the line in the above text:
```
{{ line_text }}
```
Focusing on the outermost function call in this line, you respond with:
"""
Loading

0 comments on commit cb76283

Please sign in to comment.