1
0
mirror of https://github.com/vcmi/vcmi.git synced 2025-02-03 13:01:33 +02:00

Implement detection of typos in json using Levenshtein Distance

This commit is contained in:
Ivan Savenko 2024-09-15 20:14:08 +00:00
parent 7209969d9d
commit e09fbe5ea4

View File

@ -21,6 +21,80 @@
VCMI_LIB_NAMESPACE_BEGIN
// Algorithm for detection of typos in words
// Determines how 'different' two strings are - how many changes must be done to turn one string into another one
// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
static int getLevenshteinDistance(const std::string & s, const std::string & t)
{
int n = t.size();
int m = s.size();
// create two work vectors of integer distances
std::vector<int> v0(n+1, 0);
std::vector<int> v1(n+1, 0);
// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance from an empty s to t;
// that distance is the number of characters to append to s to make t.
for (int i = 0; i < n; ++i)
v0[i] = i;
for (int i = 0; i < m; ++i)
{
// calculate v1 (current row distances) from the previous row v0
// first element of v1 is A[i + 1][0]
// edit distance is delete (i + 1) chars from s to match empty t
v1[0] = i + 1;
// use formula to fill in the rest of the row
for (int j = 0; j < n; ++j)
{
// calculating costs for A[i + 1][j + 1]
int deletionCost = v0[j + 1] + 1;
int insertionCost = v1[j] + 1;
int substitutionCost;
if (s[i] == t[j])
substitutionCost = v0[j];
else
substitutionCost = v0[j] + 1;
v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
}
// copy v1 (current row) to v0 (previous row) for next iteration
// since data in v1 is always invalidated, a swap without copy could be more efficient
std::swap(v0, v1);
}
// after the last swap, the results of v1 are now in v0
return v0[n];
}
/// Searches for keys similar to 'target' in 'candidates' map
/// Returns closest match or empty string if no suitable candidates are found
static std::string findClosestMatch(JsonMap candidates, std::string target)
{
// Maximum distance at which we can consider strings to be similar
// If strings have more different symbols than this number then it is not a typo, but a completely different word
static constexpr int maxDistance = 5;
int bestDistance = maxDistance;
std::string bestMatch;
for (auto const & candidate : candidates)
{
int newDistance = getLevenshteinDistance(candidate.first, target);
if (newDistance < bestDistance)
{
bestDistance = newDistance;
bestMatch = candidate.first;
}
}
return bestMatch;
}
static std::string emptyCheck(JsonValidator & validator, const JsonNode & baseSchema, const JsonNode & schema, const JsonNode & data)
{
// check is not needed - e.g. incorporated into another check
@ -417,7 +491,13 @@ static std::string additionalPropertiesCheck(JsonValidator & validator, const Js
// or, additionalItems field can be bool which indicates if such items are allowed
else if(!schema.isNull() && !schema.Bool()) // present and set to false - error
errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
{
std::string bestCandidate = findClosestMatch(baseSchema["properties"].Struct(), entry.first);
if (!bestCandidate.empty())
errors += validator.makeErrorMessage("Unknown entry found: '" + entry.first + "'. Perhaps you meant '" + bestCandidate + "'?");
else
errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
}
}
}
return errors;