Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(invariant): fuzz with values from events and return values #7666

Merged
merged 30 commits into from
May 20, 2024

Conversation

grandizzy
Copy link
Collaborator

@grandizzy grandizzy commented Apr 15, 2024

Motivation

Ref #51
https://forum.openzeppelin.com/t/using-automatic-analysis-tools-with-makerdao-contracts/1021/2

Solution

  • do not collect state from a call if the call reverts
  • add BasicTxDetails and CallTargetDetails structs
  • using abi and function from identified fuzzed contracts, decode call result (if function has outputs) and logs (if any) and insert samples collected in dictionary, per var type. Samples are stored up to configured test run depth and are persisted / reused between test runs
  • when fuzzing a param from state dict, use saved samples of param type with a weight of 50
  • some code cleanup and tests for collecting values from result and event log

Tests

forge test --mt invariant_check_dschief_with_return_value
/// @title SimpleDSChief
/// @notice Small approval-voting fixture used as an invariant-fuzzing target.
/// @dev `hacked` is set by `checkInvariant` once a caller's deposit exceeds the
///      approvals recorded for the candidate behind that caller's slate.
///      The contract is a fuzzing fixture; its behavior is intentionally left as is.
contract SimpleDSChief {
    /// @notice Candidate address stored for each slate hash.
    mapping(bytes32 => address) public slates;
    /// @notice Slate hash each account currently votes for.
    mapping(address => bytes32) public votes;
    /// @notice Accumulated weight per candidate address.
    mapping(address => uint256) public approvals;
    /// @notice Amount each account has locked.
    mapping(address => uint256) public deposits;
    /// @notice Flag raised by `checkInvariant`; starts out false (the type's default).
    bool public hacked;

    /// @notice Increases the caller's deposit by `wad` and credits the same
    ///         weight to the candidate of the caller's current slate.
    function lock(uint256 wad) public {
        address voter = msg.sender;
        deposits[voter] = add(deposits[voter], wad);
        addWeight(wad, votes[voter]);
    }

    /// @notice Decreases the caller's deposit by `wad` and debits the same
    ///         weight from the candidate of the caller's current slate.
    function free(uint256 wad) public {
        address voter = msg.sender;
        deposits[voter] = sub(deposits[voter], wad);
        subWeight(wad, votes[voter]);
    }

    /// @notice Registers the slate for `yay` and moves the caller's vote to it.
    /// @return The slate hash that was registered and voted for.
    function voteYays(address yay) public returns (bytes32) {
        bytes32 registered = etch(yay);
        voteSlate(registered);
        return registered;
    }

    /// @notice Stores `yay` under the keccak256 hash of its packed encoding.
    /// @return slate The hash under which `yay` was stored.
    function etch(address yay) public returns (bytes32 slate) {
        slate = keccak256(abi.encodePacked(yay));
        slates[slate] = yay;
    }

    /// @notice Moves the caller's full deposit weight from the previously
    ///         voted slate to `slate`.
    function voteSlate(bytes32 slate) public {
        address voter = msg.sender;
        uint256 stake = deposits[voter];
        // Debit the old slate first, then switch the vote, then credit the new slate.
        subWeight(stake, votes[voter]);
        votes[voter] = slate;
        addWeight(stake, slate);
    }

    /// @dev Credits `weight` to the candidate currently stored for `slate`.
    function addWeight(uint256 weight, bytes32 slate) internal {
        address candidate = slates[slate];
        approvals[candidate] = add(approvals[candidate], weight);
    }

    /// @dev Debits `weight` from the candidate currently stored for `slate`.
    function subWeight(uint256 weight, bytes32 slate) internal {
        address candidate = slates[slate];
        approvals[candidate] = sub(approvals[candidate], weight);
    }

    /// @dev Addition that requires the result not to be smaller than `x`.
    function add(uint256 x, uint256 y) internal pure returns (uint256 z) {
        z = x + y;
        require(z >= x);
    }

    /// @dev Subtraction that requires the result not to be larger than `x`.
    function sub(uint256 x, uint256 y) internal pure returns (uint256 z) {
        z = x - y;
        require(z <= x);
    }

    /// @notice Sets `hacked` when the approvals of the caller's chosen
    ///         candidate are lower than the caller's own deposit.
    function checkInvariant() public {
        address voter = msg.sender;
        address candidate = slates[votes[voter]];
        if (approvals[candidate] < deposits[voter]) {
            hacked = true;
        }
    }
}

// Invariant test harness for SimpleDSChief: registers the contract as the
// fuzz target and restricts the fuzzed callers to three fixed addresses.
contract SimpleDSChiefTest is Test {
    SimpleDSChief dsChief;

    // Deploys the target and registers it, plus the three senders, with the fuzzer.
    function setUp() public {
        dsChief = new SimpleDSChief();
        targetContract(address(dsChief));

        address[3] memory senders = [address(0x10000), address(0x20000), address(0x30000)];
        for (uint256 i = 0; i < senders.length; i++) {
            targetSender(senders[i]);
        }
    }

    // The invariant holds as long as the target never sets its `hacked` flag.
    /// forge-config: default.invariant.runs = 500
    /// forge-config: default.invariant.depth = 500
    function invariant_check_dschief_with_return_value() public view {
        bool compromised = dsChief.hacked();
        assertFalse(compromised);
    }
}
forge test --mt invariant_check_dschief_with_event
/// @title SimpleDSChiefWithEvent
/// @notice Variant of the approval-voting fuzzing fixture in which `etch`
///         publishes the slate hash through an event instead of a return value.
/// @dev `hacked` is set by `checkInvariant` once a caller's deposit exceeds the
///      approvals recorded for the candidate behind that caller's slate.
///      The contract is a fuzzing fixture; its behavior is intentionally left as is.
contract SimpleDSChiefWithEvent {
    /// @notice Emitted by `etch` with the hash under which the candidate was stored.
    event Slate(bytes32 indexed slate);

    /// @notice Candidate address stored for each slate hash.
    mapping(bytes32 => address) public slates;
    /// @notice Slate hash each account currently votes for.
    mapping(address => bytes32) public votes;
    /// @notice Accumulated weight per candidate address.
    mapping(address => uint256) public approvals;
    /// @notice Amount each account has locked.
    mapping(address => uint256) public deposits;

    /// @notice Flag raised by `checkInvariant`; starts out false (the type's default).
    bool public hacked;

    /// @notice Increases the caller's deposit by `wad` and credits the same
    ///         weight to the candidate of the caller's current slate.
    function lock(uint256 wad) public {
        address voter = msg.sender;
        deposits[voter] = add(deposits[voter], wad);
        addWeight(wad, votes[voter]);
    }

    /// @notice Decreases the caller's deposit by `wad` and debits the same
    ///         weight from the candidate of the caller's current slate.
    function free(uint256 wad) public {
        address voter = msg.sender;
        deposits[voter] = sub(deposits[voter], wad);
        subWeight(wad, votes[voter]);
    }

    /// @notice Stores `yay` under the hash of its packed encoding and moves
    ///         the caller's vote to that slate. No event is emitted here.
    function voteYays(address yay) public {
        bytes32 slate = keccak256(abi.encodePacked(yay));
        slates[slate] = yay;
        voteSlate(slate);
    }

    /// @notice Stores `yay` under the hash of its packed encoding and emits
    ///         `Slate` with that hash.
    function etch(address yay) public {
        bytes32 slate = keccak256(abi.encodePacked(yay));
        slates[slate] = yay;
        emit Slate(slate);
    }

    /// @notice Moves the caller's full deposit weight from the previously
    ///         voted slate to `slate`.
    function voteSlate(bytes32 slate) public {
        address voter = msg.sender;
        uint256 stake = deposits[voter];
        // Debit the old slate first, then switch the vote, then credit the new slate.
        subWeight(stake, votes[voter]);
        votes[voter] = slate;
        addWeight(stake, slate);
    }

    /// @dev Credits `weight` to the candidate currently stored for `slate`.
    function addWeight(uint256 weight, bytes32 slate) internal {
        address candidate = slates[slate];
        approvals[candidate] = add(approvals[candidate], weight);
    }

    /// @dev Debits `weight` from the candidate currently stored for `slate`.
    function subWeight(uint256 weight, bytes32 slate) internal {
        address candidate = slates[slate];
        approvals[candidate] = sub(approvals[candidate], weight);
    }

    /// @dev Addition that requires the result not to be smaller than `x`.
    function add(uint256 x, uint256 y) internal pure returns (uint256 z) {
        z = x + y;
        require(z >= x);
    }

    /// @dev Subtraction that requires the result not to be larger than `x`.
    function sub(uint256 x, uint256 y) internal pure returns (uint256 z) {
        z = x - y;
        require(z <= x);
    }

    /// @notice Sets `hacked` when the approvals of the caller's chosen
    ///         candidate are lower than the caller's own deposit.
    function checkInvariant() public {
        address voter = msg.sender;
        address candidate = slates[votes[voter]];

        if (approvals[candidate] < deposits[voter]) {
            hacked = true;
        }
    }
}

// Invariant test harness for SimpleDSChiefWithEvent: registers the contract as
// the fuzz target and restricts the fuzzed callers to three fixed addresses.
contract SimpleDSChiefWithEventTest is Test {
    SimpleDSChiefWithEvent dsChief;

    // Deploys the target and registers it, plus the three senders, with the fuzzer.
    function setUp() public {
        dsChief = new SimpleDSChiefWithEvent();
        targetContract(address(dsChief));

        address[3] memory senders = [address(0x10000), address(0x20000), address(0x30000)];
        for (uint256 i = 0; i < senders.length; i++) {
            targetSender(senders[i]);
        }
    }

    // The invariant holds as long as the target never sets its `hacked` flag.
    /// forge-config: default.invariant.runs = 500
    /// forge-config: default.invariant.depth = 500
    function invariant_check_dschief_with_event() public view {
        bool compromised = dsChief.hacked();
        assertFalse(compromised);
    }
}
CC @klkvr @mds1

@grandizzy
Copy link
Collaborator Author

With latest PR changes foundry can catch DSChief bug in about 30 seconds (500 runs / 500 depth, missing it ~ 2 out of 10 times) and never missed in ~ 110 seconds (2000 runs with depth of 500)

Ran 1 test suite in 28.02s (28.02s CPU time): 0 tests passed, 1 failed, 0 skipped (1 total tests)
Failing tests:
Encountered 1 failing test in test/research/vera_dschief.t.sol:SimpleDSChiefTest
[FAIL. Reason: assertion failed]
        [Sequence]
                sender=0x0000000000000000000000000000000000030000 addr=[test/research/vera_dschief.t.sol:SimpleDSChief]0x5615dEB798BB3E4dFa0139dFa1b3D433Cc23b72f calldata=voteSlate(bytes32) args=[0xe100d6d47cca6bc77fdd20fbcbbfc6c8db6b9e81a47e4bda6afca133036d0ab5]
                sender=0x0000000000000000000000000000000000030000 addr=[test/research/vera_dschief.t.sol:SimpleDSChief]0x5615dEB798BB3E4dFa0139dFa1b3D433Cc23b72f calldata=lock(uint256) args=[1]
                sender=0x0000000000000000000000000000000000010000 addr=[test/research/vera_dschief.t.sol:SimpleDSChief]0x5615dEB798BB3E4dFa0139dFa1b3D433Cc23b72f calldata=etch(address) args=[0x00000000000000000000000000000000000008DA]
                sender=0x0000000000000000000000000000000000030000 addr=[test/research/vera_dschief.t.sol:SimpleDSChief]0x5615dEB798BB3E4dFa0139dFa1b3D433Cc23b72f calldata=checkInvariant() args=[]
 invariant_check_dschief() (runs: 500, calls: 249548, reverts: 32928)

The proposed solution is to:

  • collect a number of samples from target selectors return values and from events. These samples are limited to configured test run depth.
  • samples are persisted and applied across runs with a weight of 40 (so when fuzzing from state there'll be 60% values generated from unique values collected during run and 40% from values reused across runs)
  • the solution favours many runs with a moderate configured depth (rather than fewer runs with a large depth) - so issues like the DSChief one likely won't be caught when running, let's say, 10 times with a depth in the thousands
  • the number of samples and their randomness weight can be made configurable options (with the defaults described above), but this will increase the UX complexity, so I'm not sure about it

A further improvement that can be considered is to make samples more efficient by collecting and applying them per type. However, this introduces code complexity, as we'll have to decode results / events with the proper target ABI and also update the fuzz strategies to take the fuzzed types into account.

@mds1 @klkvr would love to hear your thoughts re this approach, thanks

…ing. Decode results and persist per types. Use typed samples when fuzzing from state.
@grandizzy
Copy link
Collaborator Author

grandizzy commented Apr 18, 2024

I went ahead and committed a change to collect and apply samples per result type and with this approach was able to reproduce the DSChief bug consistently in preliminary testing, with same 500 / 500.
Also, since these samples are now collected and applied in a targeted way (so if you fuzz an address from state, the fuzzer makes sure the value used from the collected samples is of address type), I don't think making the number of samples and the randomness configurable would make sense anymore.

Copy link
Member

@klkvr klkvr left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this makes a lot of sense, the next step should probably be to improve the way we handle logs and storage

only have a couple comments

crates/evm/fuzz/src/strategies/state.rs Outdated Show resolved Hide resolved
Comment on lines 211 to 212
if rand::thread_rng().gen_range(0..100) < 50 {
typed_samples
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we should use prop_perturb here as randomness source

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reorged this part in 58fc5a9 not sure we still need perturb, pls lmk wdyt

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see we are still using rand::thread_rng(), which I believe causes proptest runs with the same seed to generate different outputs, so perhaps we still need prop_perturb to ensure deterministic randomness?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, I added a slightly different way using (0..100).prop_flat_map(Just) in d00c192 pls check

// Decode result and collect samples to be used in subsequent fuzz runs.
if !result.is_empty() {
if let Ok(decoded_result) = func.abi_decode_output(result, true) {
dict.insert_sample_value(decoded_result, run_depth);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the motivation behind using run_depth as a limit for values of a certain type?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a limit for samples to remain relevant: if it is too low, then we're going to use the same values / repeat the same test too many times; at the same time, if it grows too big, then samples won't be exercised enough to reveal failures due to potential dependencies.
The test depth is a limit I came up with to make sure that, in the extreme case where we run with a test depth of n for a fuzz target containing the same number n of functions, each with one return value, we collect one sample from each function.
We could also introduce a new config for such to replace default behavior.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test depth is a limit I came up with to make sure that, in the extreme case where we run with a test depth of n for a fuzz target containing the same number n of functions, each with one return value, we collect one sample from each function.

Is this assumption true given how fuzz targets and selectors are chosen? i.e. I don't think there's a guarantee you cover all n functions with a depth of n, especially if there's >1 contract. Also related to #2986

But regardless, I'll echo my thoughts in #7666 (comment) — performance-wise (catching DSChief) this PR looks really good, and I think it's better to ship as-is and revisit this limit once we have benchmarks, than to block this PR on deciding a good value. Since before benchmarks we're just taking our best guess at what sensible values are anyway :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this assumption true given how fuzz targets and selectors are chosen? i.e. I don't think there's a guarantee you cover all n functions with a depth of n, especially if there's >1 contract. Also related to #2986

Indeed, that is not a guarantee just the ideal case, as you say it can be improved as we go. Only concern could be that if this limit is too low then other relevant data collected won't be exercised enough.

@grandizzy grandizzy changed the title feat(invariant): scrape return values and add to fuzz dictionary feat(invariant): scrape logs and return values and collect samples Apr 21, 2024
@grandizzy
Copy link
Collaborator Author

this makes a lot of sense, the next step should probably be to improve the way we handle logs and storage

I added logs decoding in d9d8619, and will track storage handling in a follow-up PR (to keep the scope of this one limited), if you're OK with that

@grandizzy grandizzy marked this pull request as ready for review April 21, 2024 11:29
@@ -247,7 +249,17 @@ impl<'a> InvariantExecutor<'a> {
let mut state_changeset =
call_result.state_changeset.to_owned().expect("no changesets");

collect_data(&mut state_changeset, sender, &call_result, &fuzz_state);
if !&call_result.reverted {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's any valuable data we could collect from a reverted call, hence this addition; please let me know if I am missing something. (A further improvement, when fail-on-revert is set to false, is to remove reverted calls from the final sequence - this should improve shrinking performance.)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed that we don't need to collect data from a reverted call

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 , will follow up with a PR to exclude reverted call from final sequence if running with fail-on-revert=false, should improve shrinking phase a lot

@grandizzy grandizzy requested a review from klkvr April 21, 2024 11:33
@grandizzy grandizzy marked this pull request as draft April 23, 2024 07:44
@mds1
Copy link
Collaborator

mds1 commented Apr 26, 2024

In general the approach described here makes a lot of sense to me, definitely supportive. Regarding:

the number of samples and their randomness weight can be made configurable options (with defaults described above) but this will increase the UX complexity, so not sure about

I think your defaults seem pretty reasonable, so we should avoid increasing UX complexity or spending too much time debating param defaults until we have confidence that the changes matter or that the added complexity is worth it. And we can gain that confidence once we have good benchmarks to compare against. So my suggestion would be to stick with what we have here, but document all the defaults/assumptions somewhere (I'm indifferent as to where), so that we can easily revisit these decisions and adjust based on data later

@grandizzy grandizzy marked this pull request as ready for review April 26, 2024 11:20
(sender, contract)
})
})
.prop_map(|(sender, call_details)| BasicTxDetails::new(sender, call_details))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please validate this addition, don't see any issue with but confirmation would be great

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry could you give a tldr on this change? are we just transforming the type here?

Copy link
Collaborator Author

@grandizzy grandizzy Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, it's just a code cleanup related change, that is to create a BasicTxDetails which is now a struct (used to be type) introduced in 55fd876

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

got it, seems reasonable to me but deferring to @klkvr since it's an implementation question :)

@grandizzy grandizzy marked this pull request as draft May 14, 2024 18:30
@grandizzy grandizzy changed the title feat(invariant): scrape logs and return values and collect samples WIP feat(invariant): fuzz with values from events and return values May 14, 2024
@grandizzy grandizzy force-pushed the scrape-result branch 2 times, most recently from 7968802 to 481971a Compare May 15, 2024 12:17
@grandizzy grandizzy changed the title WIP feat(invariant): fuzz with values from events and return values feat(invariant): fuzz with values from events and return values May 15, 2024
@grandizzy grandizzy marked this pull request as ready for review May 15, 2024 13:10
Copy link
Member

@mattsse mattsse left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm

Copy link
Member

@klkvr klkvr left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm

@grandizzy grandizzy merged commit 1ddea96 into foundry-rs:master May 20, 2024
19 checks passed
@grandizzy grandizzy deleted the scrape-result branch May 20, 2024 07:22
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

4 participants