Skip to content

Commit 8488144

Browse files
chore: allow adding custom parameter scorers
In some cases (find, aggregate, count, explain, deleteMany, etc.), we need to grade extra provided arguments depending on the prompt itself. Sometimes additional parameters are fine and sometimes they are not. For example, adding keys to a filter might lead to a different result; hence, if any such thing happens, we should grade the accuracy as 0 and not 0.75. To support this use case, this commit introduces the idea of a custom scorer that can be plugged in to the accuracy scorer to provide more controlled accuracy grading. Additionally, this commit reverts the default behaviour of handling added parameters. Earlier, we were marking newly added parameters as hallucinations and hence grading 0.75. But now, after figuring out that most of our tools don't even expect extra parameters, we are flipping the switch and will instead grade 0 when additional parameters are specified, unless a scorer is provided to handle the custom scoring logic.
1 parent 6532c7a commit 8488144

12 files changed

+475
-141
lines changed

tests/accuracy/aggregate.test.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js";
23

34
describeAccuracyTests([
45
{
56
prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
67
expectedToolCalls: [
78
{
89
toolName: "aggregate",
9-
parameters: {
10-
pipeline: { $group: { _id: "$release_year", count: { $sum: 1 } } },
11-
},
10+
parameters: withParameterScorer(
11+
{
12+
database: "mflix",
13+
collection: "movies",
14+
pipeline: [{ $group: { _id: "$release_year", count: { $sum: 1 } } }],
15+
},
16+
// There should not be a $match at all, hence the custom scorer
17+
ParameterScorers.noAdditionsAllowedForPaths(["pipeline.0.$match"])
18+
),
1219
},
1320
],
1421
},

tests/accuracy/collectionSchema.test.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,25 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
22

33
describeAccuracyTests([
44
{
5-
prompt: "Is there a title field in 'db1.coll1' namespace?",
5+
prompt: "Is there a title field in 'mflix.movies' namespace?",
66
expectedToolCalls: [
77
{
88
toolName: "collection-schema",
99
parameters: {
10-
database: "db1",
11-
collection: "coll1",
10+
database: "mflix",
11+
collection: "movies",
1212
},
1313
},
1414
],
1515
},
1616
{
17-
prompt: "What is the type of value stored in title field in coll1 collection in db1 database?",
17+
prompt: "What is the type of value stored in title field in movies collection in mflix database?",
1818
expectedToolCalls: [
1919
{
2020
toolName: "collection-schema",
2121
parameters: {
22-
database: "db1",
23-
collection: "coll1",
22+
database: "mflix",
23+
collection: "movies",
2424
},
2525
},
2626
],

tests/accuracy/count.test.ts

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js";
23

34
describeAccuracyTests([
45
{
56
prompt: "Count number of documents in 'mflix.movies' namespace.",
67
expectedToolCalls: [
78
{
89
toolName: "count",
9-
parameters: {
10-
database: "mflix",
11-
collection: "movies",
12-
},
10+
parameters: withParameterScorer(
11+
{
12+
database: "mflix",
13+
collection: "movies",
14+
},
15+
ParameterScorers.emptyAdditionsAllowedForPaths(["query"])
16+
),
1317
},
1418
],
1519
},
@@ -18,10 +22,13 @@ describeAccuracyTests([
1822
expectedToolCalls: [
1923
{
2024
toolName: "count",
21-
parameters: {
22-
database: "comics",
23-
collection: "characters",
24-
},
25+
parameters: withParameterScorer(
26+
{
27+
database: "comics",
28+
collection: "characters",
29+
},
30+
ParameterScorers.emptyAdditionsAllowedForPaths(["query"])
31+
),
2532
},
2633
],
2734
},
@@ -30,11 +37,14 @@ describeAccuracyTests([
3037
expectedToolCalls: [
3138
{
3239
toolName: "count",
33-
parameters: {
34-
database: "mflix",
35-
collection: "movies",
36-
query: { runtime: { $lt: 100 } },
37-
},
40+
parameters: withParameterScorer(
41+
{
42+
database: "mflix",
43+
collection: "movies",
44+
query: { runtime: { $lt: 100 } },
45+
},
46+
ParameterScorers.noAdditionsAllowedForPaths(["query"])
47+
),
3848
},
3949
],
4050
},

tests/accuracy/createIndex.test.ts

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js";
23

34
describeAccuracyTests([
45
{
56
prompt: "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }",
67
expectedToolCalls: [
78
{
89
toolName: "create-index",
9-
parameters: {
10-
database: "mflix",
11-
collection: "movies",
12-
keys: {
13-
release_year: 1,
10+
parameters: withParameterScorer(
11+
{
12+
database: "mflix",
13+
collection: "movies",
14+
keys: {
15+
release_year: 1,
16+
},
1417
},
15-
},
18+
ParameterScorers.noAdditionsAllowedForPaths(["keys"])
19+
),
1620
},
1721
],
1822
},
@@ -21,13 +25,16 @@ describeAccuracyTests([
2125
expectedToolCalls: [
2226
{
2327
toolName: "create-index",
24-
parameters: {
25-
database: "mflix",
26-
collection: "movies",
27-
keys: {
28-
title: "text",
28+
parameters: withParameterScorer(
29+
{
30+
database: "mflix",
31+
collection: "movies",
32+
keys: {
33+
title: "text",
34+
},
2935
},
30-
},
36+
ParameterScorers.noAdditionsAllowedForPaths(["keys"])
37+
),
3138
},
3239
],
3340
},

tests/accuracy/deleteMany.test.ts

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js";
23

34
describeAccuracyTests([
45
{
56
prompt: "Delete all the documents from 'mflix.movies' namespace",
67
expectedToolCalls: [
78
{
89
toolName: "delete-many",
9-
parameters: {
10-
database: "mflix",
11-
collection: "movies",
12-
},
10+
parameters: withParameterScorer(
11+
{
12+
database: "mflix",
13+
collection: "movies",
14+
},
15+
ParameterScorers.emptyAdditionsAllowedForPaths(["filter"])
16+
),
1317
},
1418
],
1519
},
@@ -18,10 +22,13 @@ describeAccuracyTests([
1822
expectedToolCalls: [
1923
{
2024
toolName: "delete-many",
21-
parameters: {
22-
database: "mflix",
23-
collection: "movies",
24-
},
25+
parameters: withParameterScorer(
26+
{
27+
database: "mflix",
28+
collection: "movies",
29+
},
30+
ParameterScorers.emptyAdditionsAllowedForPaths(["filter"])
31+
),
2532
},
2633
],
2734
},
@@ -30,10 +37,14 @@ describeAccuracyTests([
3037
expectedToolCalls: [
3138
{
3239
toolName: "delete-many",
33-
parameters: {
34-
database: "mflix",
35-
collection: "movies",
36-
},
40+
parameters: withParameterScorer(
41+
{
42+
database: "mflix",
43+
collection: "movies",
44+
filter: { runtime: { $lt: 100 } },
45+
},
46+
ParameterScorers.noAdditionsAllowedForPaths(["filter"])
47+
),
3748
},
3849
],
3950
},

tests/accuracy/explain.test.ts

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
2+
import { ParameterScorers, withParameterScorer } from "./sdk/parameterScorer.js";
23

34
/**
45
* None of these tests score a parameter match on any of the models, likely
@@ -11,62 +12,72 @@ describeAccuracyTests([
1112
expectedToolCalls: [
1213
{
1314
toolName: "explain",
14-
parameters: {
15-
database: "mflix",
16-
collection: "movies",
17-
method: [
18-
{
19-
name: "find",
20-
arguments: {
21-
filter: { release_year: 2020 },
15+
parameters: withParameterScorer(
16+
{
17+
database: "mflix",
18+
collection: "movies",
19+
method: [
20+
{
21+
name: "find",
22+
arguments: {
23+
filter: { release_year: 2020 },
24+
},
2225
},
23-
},
24-
],
25-
},
26+
],
27+
},
28+
// Any addition to method itself will essentially change the explain output
29+
ParameterScorers.noAdditionsAllowedForPaths(["method"])
30+
),
2631
},
2732
],
2833
},
2934
{
30-
prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`,
35+
prompt: `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`,
3136
expectedToolCalls: [
3237
{
3338
toolName: "explain",
34-
parameters: {
35-
database: "mflix",
36-
collection: "movies",
37-
method: [
38-
{
39-
name: "aggregate",
40-
arguments: {
41-
pipeline: [
42-
{
43-
$match: { release_year: 2020 },
44-
},
45-
],
39+
parameters: withParameterScorer(
40+
{
41+
database: "mflix",
42+
collection: "movies",
43+
method: [
44+
{
45+
name: "aggregate",
46+
arguments: {
47+
pipeline: [
48+
{
49+
$match: { release_year: 2020 },
50+
},
51+
],
52+
},
4653
},
47-
},
48-
],
49-
},
54+
],
55+
},
56+
ParameterScorers.noAdditionsAllowedForPaths(["method"])
57+
),
5058
},
5159
],
5260
},
5361
{
54-
prompt: `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`,
62+
prompt: `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`,
5563
expectedToolCalls: [
5664
{
5765
toolName: "explain",
58-
parameters: {
59-
database: "mflix",
60-
collection: "movies",
61-
method: [
62-
{
63-
name: "count",
64-
arguments: {
65-
query: { release_year: 2020 },
66+
parameters: withParameterScorer(
67+
{
68+
database: "mflix",
69+
collection: "movies",
70+
method: [
71+
{
72+
name: "count",
73+
arguments: {
74+
query: { release_year: 2020 },
75+
},
6676
},
67-
},
68-
],
69-
},
77+
],
78+
},
79+
ParameterScorers.noAdditionsAllowedForPaths(["method"])
80+
),
7081
},
7182
],
7283
},

0 commit comments

Comments
 (0)