diff options
author | Tom de Vries <tdevries@suse.de> | 2019-11-07 10:49:56 +0100 |
---|---|---|
committer | Tom de Vries <tdevries@suse.de> | 2019-11-07 10:49:56 +0100 |
commit | 496af5c81112807c9909fb7038404905e15950ea (patch) | |
tree | e1bf4a85f8ebce5eb36ac04cda7cbe3862e06fef | |
parent | 595d3787e9cbedbceb6182f873a4774707c0e74f (diff) | |
download | gdb-496af5c81112807c9909fb7038404905e15950ea.zip gdb-496af5c81112807c9909fb7038404905e15950ea.tar.gz gdb-496af5c81112807c9909fb7038404905e15950ea.tar.bz2 |
[gdb/contrib] Add words.sh script
Add a script that takes a list of files as arguments and outputs a list of
words from the C comments with their frequencies.
For:
...
$ ./gdb/contrib/words.sh $(find gdb -type f \( -name "*.c" -o -name "*.h" \))
...
it generates a list of ~15000 words prefixed with frequency.
This could be used to generate a dictionary that is kept as part of the
sources, against which new code can be checked, generating a warning or
error. The hope is that misspellings would trigger this frequently, and rare
words rarely, otherwise the burden of updating the dictionary would be too
much.
And for:
...
$ ./gdb/contrib/words.sh -f 1 $(find gdb -type f \( -name "*.c" -o -name "*.h" \))
...
it generates a list of ~5000 words with frequency 1.
This can be used to scan for misspellings manually.
Change-Id: I7b119c9a4519cdbf62a3243d1df2927c80813e8b
-rwxr-xr-x | gdb/contrib/words.sh | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/gdb/contrib/words.sh b/gdb/contrib/words.sh new file mode 100755 index 0000000..ae38539 --- /dev/null +++ b/gdb/contrib/words.sh @@ -0,0 +1,129 @@ +#!/bin/sh + +# Copyright (C) 2019 Free Software Foundation, Inc. +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# This script intends to facilitate spell checking of comments in C sources. +# It: +# - extracts comments from C files +# - transforms the comments into a list of lowercase words +# - prefixes each word with the frequency +# - filters out words within a frequency range +# - sorts the words, longest first +# +# For: +# ... +# $ ./gdb/contrib/words.sh $(find gdb -type f -name "*.c" -o -name "*.h") +# ... +# it generates a list of ~15000 words prefixed with frequency. +# +# This could be used to generate a dictionary that is kept as part of the +# sources, against which new code can be checked, generating a warning or +# error. The hope is that misspellings would trigger this frequently, and rare +# words rarely, otherwise the burden of updating the dictionary would be too +# much. +# +# And for: +# ... +# $ ./gdb/contrib/words.sh -f 1 $(find gdb -type f -name "*.c" -o -name "*.h") +# ... +# it generates a list of ~5000 words with frequency 1. +# +# This can be used to scan for misspellings manually. 
#

# Usage: words.sh [-f|--freq N] [--min N] [--max N] [FILE...]
#
#   -f N, --freq N   only list words that occur exactly N times
#   --min N          only list words that occur at least N times
#   --max N          only list words that occur at most N times
#
# A frequency of 0 means "no limit".  Without FILE arguments the C source is
# read from standard input.  Each output line is a count followed by a word;
# lines are ordered longest first.

# Report a command-line error on stderr and terminate.
usage_error ()
{
    echo "$0: $1" >&2
    exit 1
}

# Validate the argument of a frequency option.
#   $1: the option as typed by the user
#   $2: number of words left on the command line, option included
#   $3: the candidate argument (empty when missing)
# The argument must be a non-negative integer, otherwise the script exits.
check_freq_arg ()
{
    if [ "$2" -lt 2 ]; then
        usage_error "option $1 requires an argument"
    fi
    case "$3" in
        ''|*[!0-9]*)
            usage_error "option $1 requires a non-negative integer, got '$3'"
            ;;
    esac
}

minfreq=
maxfreq=
while [ $# -gt 0 ]; do
    case "$1" in
        --freq|-f)
            check_freq_arg "$1" $# "${2-}"
            minfreq=$2
            maxfreq=$2
            shift 2
            ;;
        --min)
            check_freq_arg "$1" $# "${2-}"
            minfreq=$2
            if [ "$maxfreq" = "" ]; then
                maxfreq=0
            fi
            shift 2
            ;;
        --max)
            check_freq_arg "$1" $# "${2-}"
            maxfreq=$2
            if [ "$minfreq" = "" ]; then
                minfreq=0
            fi
            shift 2
            ;;
        *)
            # First non-option word: everything from here on is a file name.
            break
            ;;
    esac
done

# No frequency option given: do not filter at all.
if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
    minfreq=0
    maxfreq=0
fi

awkfile=$(mktemp) || usage_error "cannot create temporary file"
# Remove the temporary awk program on exit; turn the usual termination
# signals into a regular exit so that the EXIT trap runs for them as well.
trap 'rm -f "$awkfile"' EXIT
trap 'exit 1' HUP INT TERM

# The awk program prints the text found inside /* ... */ comments.  The
# here-document delimiter is quoted so that the shell leaves $0 alone.
cat > "$awkfile" <<'EOF'
# An unterminated comment in one file must not swallow the next file.
FNR == 1 {
    in_comment = 0
}

# Walk over the line, alternating between looking for a comment opener and
# a comment closer, so that several comments on one line, and a line that
# closes one comment and opens another, are all handled.
{
    rest = $0
    while (length(rest) > 0) {
        if (in_comment) {
            close_at = index(rest, "*/")
            if (close_at == 0) {
                # The comment continues on the next line.
                print rest
                break
            }
            print substr(rest, 1, close_at - 1)
            rest = substr(rest, close_at + 2)
            in_comment = 0
        } else {
            open_at = index(rest, "/*")
            if (open_at == 0) {
                # Only code is left on this line.
                break
            }
            rest = substr(rest, open_at + 2)
            in_comment = 1
        }
    }
}
EOF

# Stabilize sort.
export LC_ALL=C

# Pipeline stages:
#  1. extract the comment text
#  2. break it into one token per line (punctuation, brackets and digit runs
#     act as separators; relies on GNU sed for \n in the replacement and \t
#     inside a bracket expression)
#  3. lowercase the tokens
#  4. strip stray blanks and drop the empty lines created by the splitting,
#     so that the empty string is not counted as a word
#  5. count identical tokens
#  6. keep the tokens within the requested frequency range and prefix each
#     line with its length (the limits are passed as awk variables rather
#     than pasted into the program text)
#  7. sort on that length, longest first, and remove the length again
awk \
    -f "$awkfile" \
    -- "$@" \
    | sed -e 's/[%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
          -e 's/\[/\n/g' \
          -e 's/\]/\n/g' \
          -e 's/[0-9][0-9]*/\n/g' \
    | tr '[:upper:]' '[:lower:]' \
    | sed -e 's/[ \t]*//g' -e '/^$/d' \
    | sort \
    | uniq -c \
    | awk -v min="$minfreq" -v max="$maxfreq" '
        (min == 0 || min <= $1 + 0) && (max == 0 || $1 + 0 <= max) {
            print length($0) " " $0
        }' \
    | sort -n -r \
    | cut -d ' ' -f 2-