mirror of
https://github.com/vcmi/vcmi.git
synced 2025-02-03 13:01:33 +02:00
Implement detection of typos in json using Levenshtein Distance
This commit is contained in:
parent
7209969d9d
commit
e09fbe5ea4
@ -21,6 +21,80 @@
|
||||
|
||||
VCMI_LIB_NAMESPACE_BEGIN
|
||||
|
||||
// Algorithm for detection of typos in words
|
||||
// Determines how 'different' two strings are - how many changes must be done to turn one string into another one
|
||||
// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
|
||||
static int getLevenshteinDistance(const std::string & s, const std::string & t)
|
||||
{
|
||||
int n = t.size();
|
||||
int m = s.size();
|
||||
|
||||
// create two work vectors of integer distances
|
||||
std::vector<int> v0(n+1, 0);
|
||||
std::vector<int> v1(n+1, 0);
|
||||
|
||||
// initialize v0 (the previous row of distances)
|
||||
// this row is A[0][i]: edit distance from an empty s to t;
|
||||
// that distance is the number of characters to append to s to make t.
|
||||
for (int i = 0; i < n; ++i)
|
||||
v0[i] = i;
|
||||
|
||||
for (int i = 0; i < m; ++i)
|
||||
{
|
||||
// calculate v1 (current row distances) from the previous row v0
|
||||
|
||||
// first element of v1 is A[i + 1][0]
|
||||
// edit distance is delete (i + 1) chars from s to match empty t
|
||||
v1[0] = i + 1;
|
||||
|
||||
// use formula to fill in the rest of the row
|
||||
for (int j = 0; j < n; ++j)
|
||||
{
|
||||
// calculating costs for A[i + 1][j + 1]
|
||||
int deletionCost = v0[j + 1] + 1;
|
||||
int insertionCost = v1[j] + 1;
|
||||
int substitutionCost;
|
||||
|
||||
if (s[i] == t[j])
|
||||
substitutionCost = v0[j];
|
||||
else
|
||||
substitutionCost = v0[j] + 1;
|
||||
|
||||
v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
|
||||
}
|
||||
|
||||
// copy v1 (current row) to v0 (previous row) for next iteration
|
||||
// since data in v1 is always invalidated, a swap without copy could be more efficient
|
||||
std::swap(v0, v1);
|
||||
}
|
||||
|
||||
// after the last swap, the results of v1 are now in v0
|
||||
return v0[n];
|
||||
}
|
||||
|
||||
/// Searches for keys similar to 'target' in 'candidates' map
|
||||
/// Returns closest match or empty string if no suitable candidates are found
|
||||
static std::string findClosestMatch(JsonMap candidates, std::string target)
|
||||
{
|
||||
// Maximum distance at which we can consider strings to be similar
|
||||
// If strings have more different symbols than this number then it is not a typo, but a completely different word
|
||||
static constexpr int maxDistance = 5;
|
||||
int bestDistance = maxDistance;
|
||||
std::string bestMatch;
|
||||
|
||||
for (auto const & candidate : candidates)
|
||||
{
|
||||
int newDistance = getLevenshteinDistance(candidate.first, target);
|
||||
|
||||
if (newDistance < bestDistance)
|
||||
{
|
||||
bestDistance = newDistance;
|
||||
bestMatch = candidate.first;
|
||||
}
|
||||
}
|
||||
return bestMatch;
|
||||
}
|
||||
|
||||
static std::string emptyCheck(JsonValidator & validator, const JsonNode & baseSchema, const JsonNode & schema, const JsonNode & data)
|
||||
{
|
||||
// check is not needed - e.g. incorporated into another check
|
||||
@ -417,7 +491,13 @@ static std::string additionalPropertiesCheck(JsonValidator & validator, const Js
|
||||
|
||||
// or, additionalItems field can be bool which indicates if such items are allowed
|
||||
else if(!schema.isNull() && !schema.Bool()) // present and set to false - error
|
||||
errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
|
||||
{
|
||||
std::string bestCandidate = findClosestMatch(baseSchema["properties"].Struct(), entry.first);
|
||||
if (!bestCandidate.empty())
|
||||
errors += validator.makeErrorMessage("Unknown entry found: '" + entry.first + "'. Perhaps you meant '" + bestCandidate + "'?");
|
||||
else
|
||||
errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
|
||||
}
|
||||
}
|
||||
}
|
||||
return errors;
|
||||
|
Loading…
x
Reference in New Issue
Block a user