aboutsummaryrefslogtreecommitdiff
path: root/gcc/spellcheck.c
diff options
context:
space:
mode:
authorDavid Malcolm <dmalcolm@redhat.com>2015-11-13 01:54:41 +0000
committerDavid Malcolm <dmalcolm@gcc.gnu.org>2015-11-13 01:54:41 +0000
commit277fe616911ac1ce91e9f1178d648303b4a26940 (patch)
tree574b1272f86df52172c56c614532ff5032b6f73d /gcc/spellcheck.c
parent6d82151f8c08d123eaab63409cd8ba6f85d0539c (diff)
downloadgcc-277fe616911ac1ce91e9f1178d648303b4a26940.zip
gcc-277fe616911ac1ce91e9f1178d648303b4a26940.tar.gz
gcc-277fe616911ac1ce91e9f1178d648303b4a26940.tar.bz2
Implement Levenshtein distance; use in C FE for misspelled field names
This is the combination of: [PATCH 1/2] Implement Levenshtein distance [PATCH 2/2] C FE: suggest corrections for misspelled field names plus some nit fixes in spellcheck.c. gcc/ChangeLog: * Makefile.in (OBJS): Add spellcheck.o. * spellcheck.c: New file. * spellcheck.h: New file. gcc/c/ChangeLog: * c-typeck.c: Include spellcheck.h. (lookup_field_fuzzy_find_candidates): New function. (lookup_field_fuzzy): New function. (build_component_ref): If the field was not found, try using lookup_field_fuzzy and potentially offer a suggestion. gcc/testsuite/ChangeLog: * gcc.dg/plugin/levenshtein-test-1.c: New file. * gcc.dg/plugin/levenshtein_plugin.c: New file. * gcc.dg/plugin/plugin.exp (plugin_test_list): Add levenshtein_plugin.c. * gcc.dg/spellcheck-fields.c: New file. From-SVN: r230284
Diffstat (limited to 'gcc/spellcheck.c')
-rw-r--r--gcc/spellcheck.c136
1 files changed, 136 insertions, 0 deletions
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
new file mode 100644
index 0000000..31ce322
--- /dev/null
+++ b/gcc/spellcheck.c
@@ -0,0 +1,136 @@
+/* Find near-matches for strings and identifiers.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "spellcheck.h"
+
+/* The Levenshtein distance is an "edit-distance": the minimal
+ number of one-character insertions, removals or substitutions
+ that are needed to change one string into another.
+
+ This implementation uses the Wagner-Fischer algorithm. */
+
+static edit_distance_t
+levenshtein_distance (const char *s, int len_s,
+ const char *t, int len_t)
+{
+ const bool debug = false;
+
+ if (debug)
+ {
+ printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
+ printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
+ }
+
+ if (len_s == 0)
+ return len_t;
+ if (len_t == 0)
+ return len_s;
+
+ /* We effectively build a matrix where each (i, j) contains the
+ Levenshtein distance between the prefix strings s[0:j]
+ and t[0:i].
+ Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
+ we simply keep track of the last row, v0 and a new row, v1,
+ which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
+ in favor of two (len_s + 1) allocations. These could potentially be
+ statically-allocated if we impose a maximum length on the
+ strings of interest. */
+ edit_distance_t *v0 = new edit_distance_t[len_s + 1];
+ edit_distance_t *v1 = new edit_distance_t[len_s + 1];
+
+ /* The first row is for the case of an empty target string, which
+ we can reach by deleting every character in the source string. */
+ for (int i = 0; i < len_s + 1; i++)
+ v0[i] = i;
+
+ /* Build successive rows. */
+ for (int i = 0; i < len_t; i++)
+ {
+ if (debug)
+ {
+ printf ("i:%i v0 = ", i);
+ for (int j = 0; j < len_s + 1; j++)
+ printf ("%i ", v0[j]);
+ printf ("\n");
+ }
+
+ /* The initial column is for the case of an empty source string; we
+ can reach prefixes of the target string of length i
+ by inserting i characters. */
+ v1[0] = i + 1;
+
+ /* Build the rest of the row by considering neighbours to
+ the north, west and northwest. */
+ for (int j = 0; j < len_s; j++)
+ {
+ edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
+ edit_distance_t deletion = v1[j] + 1;
+ edit_distance_t insertion = v0[j + 1] + 1;
+ edit_distance_t substitution = v0[j] + cost;
+ edit_distance_t cheapest = MIN (deletion, insertion);
+ cheapest = MIN (cheapest, substitution);
+ v1[j + 1] = cheapest;
+ }
+
+ /* Prepare to move on to next row. */
+ for (int j = 0; j < len_s + 1; j++)
+ v0[j] = v1[j];
+ }
+
+ if (debug)
+ {
+ printf ("final v1 = ");
+ for (int j = 0; j < len_s + 1; j++)
+ printf ("%i ", v1[j]);
+ printf ("\n");
+ }
+
+ edit_distance_t result = v1[len_s];
+ delete[] v0;
+ delete[] v1;
+ return result;
+}
+
+/* Calculate Levenshtein distance between two nil-terminated strings.
+ This exists purely for the unit tests. */
+
+edit_distance_t
+levenshtein_distance (const char *s, const char *t)
+{
+ return levenshtein_distance (s, strlen (s), t, strlen (t));
+}
+
+/* Calculate Levenshtein distance between two identifiers. */
+
+edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t)
+{
+ gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
+ gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
+
+ return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
+ IDENTIFIER_LENGTH (ident_s),
+ IDENTIFIER_POINTER (ident_t),
+ IDENTIFIER_LENGTH (ident_t));
+}