From 22e02a9d06892cbf0a6c60a70b00c6b014fd751c Mon Sep 17 00:00:00 2001 From: Martin Mathieson Date: Sat, 5 Sep 2020 22:23:52 +0100 Subject: Add spell-checking script. check_spelling.py scans Wireshark source or documentation files, using the general dictionary from pyspellcheck, augmented by the contents of wireshark_words.txt. Can scan: - entire folders (recursively) - individual files - open files - files affected by recent git changes --- tools/check_spelling.py | 390 +++++++++++++++++++++ tools/wireshark_words.txt | 872 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1262 insertions(+) create mode 100755 tools/check_spelling.py create mode 100644 tools/wireshark_words.txt diff --git a/tools/check_spelling.py b/tools/check_spelling.py new file mode 100755 index 0000000000..ac5d8d283b --- /dev/null +++ b/tools/check_spelling.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +# Wireshark - Network traffic analyzer +# By Gerald Combs +# Copyright 1998 Gerald Combs +# +# SPDX-License-Identifier: GPL-2.0-or-later + +import os +import re +import subprocess +import argparse +import signal +from collections import Counter + +# Looks for spelling errors among strings found in source or documentation files. +# TODO: deal with contractions - pyspellcheck doesn't seem to handle apostrophes. + +# ANSI escape sequences for text colouring/highlighting. +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + ADDED = '\033[45m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + +# Try to exit soon after Ctrl-C is pressed: the handler only sets should_exit, which the checking loops test. +should_exit = False + +def signal_handler(sig, frame): + global should_exit + should_exit = True + print('You pressed Ctrl+C - exiting') + +signal.signal(signal.SIGINT, signal_handler) + + + +# Create spellchecker, and augment with some Wireshark words. +from spellchecker import SpellChecker +# Set up our dict with words from text file. 
# N.B. the word list path is relative, so the script has to be run from the
# top-level source directory.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')


# Track words that were not found (one entry per occurrence).
missing_words = []


# Matches, in turn, each component of a camelCase identifier.
CAMEL_CASE_RE = re.compile(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')

# Matches a proto_register_protocol() call whose 3 arguments are string literals.
# N.B. doesn't work when a variable is used instead of a literal for the protocol name...
PROTOCOL_NAME_RE = re.compile(
    r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"')

# Matches a C-style comment, which may span several lines.
C_COMMENT_RE = re.compile(r'/\*.*?\*/', re.DOTALL)

# Matches the 0x-prefixed hex format specifiers that are stripped before checking.
HEX_SPECIFIER_RE = re.compile(r'0x%0[248][xX]')

# Matches a double-quoted string (with no embedded quotes).
QUOTED_STRING_RE = re.compile(r'\"([^\"]*)\"')

# Format specifiers that are deleted outright from strings before checking.
FORMAT_SPECIFIERS = ('%u', '%d', '%s')

# Punctuation characters that are turned into word separators.
PUNCTUATION = '.,`:;"\\+|()[]{}<>_-/!?=*%#&@\''
PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(PUNCTUATION, ' '))

# If any of these appears in the first few lines of a file, it is treated as generated.
GENERATED_MARKERS = ('Generated automatically',
                     'Autogenerated from',
                     'is autogenerated',
                     'automatically generated by Pidl',
                     'Created by: The Qt Meta Object Compiler')


# Split camelCase string into separate words.
def camelCaseSplit(identifier):
    """Return the list of components that make up a camelCase identifier."""
    return [m.group(0) for m in CAMEL_CASE_RE.finditer(identifier)]


# A File object contains all of the strings to be checked for a given file.
class File:
    """The strings to be spell-checked for one source or documentation file."""

    def __init__(self, file):
        """Note the type of the file, and add protocol names it registers to the dictionary.

        file -- path of the file, which is opened and read here.
        """
        self.file = file
        self.values = []

        _, extension = os.path.splitext(file)
        self.code_file = extension in {'.c', '.cpp'}

        with open(file, 'r', encoding='utf-8') as f:
            contents = f.read()

        if self.code_file:
            # Remove comments so as not to trip up RE.
            contents = removeComments(contents)

        # Add the third string argument of each proto_register_protocol() call
        # to the dictionary, so that it won't be reported as a misspelling.
        for m in PROTOCOL_NAME_RE.finditer(contents):
            protocol = m.group(3)
            spell.word_frequency.load_words([protocol])
            print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        """Append value to the list of strings to be checked."""
        self.values.append(value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    def checkMultiWords(self, word):
        """Return True if word can be split into 2 parts that are both in the dictionary.

        Words shorter than 6 characters are never considered.  The first part
        tried is always at least 3 characters long, and the second at least 4.
        """
        if len(word) < 6:
            return False

        # Don't consider if mixed cases...
        if not (word.islower() or word.isupper()):
            # ...unless the first character is one that upper() leaves unchanged
            # (e.g. a word that starts with a capital letter).
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        for idx in range(3, len(word) - 3):
            if not spell.unknown([word[:idx], word[idx:]]):
                return True

        return False

    # Check the spelling of all the words we have found for this file.
    def spellCheck(self):
        """Report each word among this file's strings that is not in the dictionary.

        A word is only reported if it is longer than 4 characters, is unknown to
        the spellchecker, and is not 2 known words run together.  Each reported
        word is printed along with the string it came from, and appended to the
        global missing_words list.  Calls exit(1) if Ctrl-C has been pressed.
        """
        num_values = len(self.values)
        for this_value, v in enumerate(self.values, start=1):
            if should_exit:
                exit(1)

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store original (as want to include for context in error report).
            original = str(v)

            # Eliminate common format specifiers.  This has to happen before
            # the punctuation step below, which would replace their '%'.
            for specifier in FORMAT_SPECIFIERS:
                v = v.replace(specifier, '')
            # Replace most punctuation with spaces.
            v = v.translate(PUNCTUATION_TO_SPACE)

            # Split into words, and further split up any camelCase words.
            words = []
            for w in v.split():
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                word = word.replace('“', '').replace('”', '')

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word):
                    print(self.file, this_value, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                          ' -> ', '?')
                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    missing_words.append(word)


def removeComments(code_string):
    """Return code_string with all C-style (/* ... */) comments removed."""
    code_string = C_COMMENT_RE.sub('', code_string)
    # C++-style comments are left alone for now, as removing them can get
    # tripped up by https://www.... within a string!
    return code_string


def removeSingleQuotes(code_string):
    """Return code_string without the embedded quotes that would confuse string matching.

    Removes string literals holding just an escaped backslash, replaces escaped
    double-quotes with a space, and removes double-quote character literals.
    """
    code_string = code_string.replace('\"\\\\\"', "")
    code_string = code_string.replace("\\\"", " ")
    code_string = code_string.replace("'\"'", "")
    return code_string


def removeHexSpecifiers(code_string):
    """Return code_string with 0x%02x, 0x%04x and 0x%08x (either case of x) removed."""
    return HEX_SPECIFIER_RE.sub('', code_string)


# Create a File object that knows about all of the strings in the given file.
def findStrings(filename):
    """Read filename and return a File holding all of the text from it to be checked.

    For code files (see File.code_file) only the contents of double-quoted
    strings are added.  For any other file, every whitespace-separated word is.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        contents = f.read()

    # Remove comments & embedded quotes so as not to trip up RE.
    contents = removeComments(contents)
    contents = removeSingleQuotes(contents)
    contents = removeHexSpecifiers(contents)

    # Create file object.
    file = File(filename)

    # What we check depends upon file type.
    if file.code_file:
        # Code so only checking strings.
        for m in QUOTED_STRING_RE.finditer(contents):
            file.add(m.group(1))
    else:
        # A documentation file, so examine all words.
        for w in contents.split():
            file.add(w)

    return file


# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    """Return True if one of the first 11 lines of filename says that it was generated."""
    with open(filename, 'r', encoding='utf-8') as f_read:
        for lines_tested, line in enumerate(f_read):
            # The comment to say that its generated is near the top, so give up once
            # get a few lines down.
            if lines_tested > 10:
                return False
            if any(marker in line for marker in GENERATED_MARKERS):
                return True

    # OK, looks like a hand-written file!
    return False


def isAppropriateFile(filename):
    """Return True if filename has a type that this script knows how to check."""
    file, extension = os.path.splitext(filename)
    return extension in {'.adoc', '.c', '.cpp', '.pod'} or file.endswith('README')


def findFilesInFolder(folder):
    """Return a list of the files under folder (searched recursively) that should be checked.

    If Ctrl-C is pressed during the search, the files found so far are returned.
    """
    files_to_check = []

    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                # Always hand back a list, as the caller will iterate over it.
                return files_to_check

            f = os.path.join(root, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check


# Check the given dissector file.
def checkFile(filename):
    """Find the strings in filename, and report any spelling errors among them."""
    file = findStrings(filename)
    file.spellCheck()



#################################################################
# Main logic.

# command-line args.  Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
def changedFilesFromGit(*diff_args):
    """Return the checkable files that 'git diff --name-only' lists as changed.

    diff_args -- any extra arguments for 'git diff' (e.g. a revision, or '--staged').

    Names that are not files in the working tree (e.g. because the change
    deleted them) are dropped, as they could not be opened for checking.
    The names are tested as given by git, so the script is assumed to be run
    from the top-level source directory.
    """
    command = ['git', 'diff', '--name-only', *diff_args]
    changed = [name.decode('utf-8')
               for name in subprocess.check_output(command).splitlines()]
    return [name for name in changed
            if os.path.isfile(name) and isAppropriateFile(name)]


parser = argparse.ArgumentParser(description='Check spelling in source and documentation files')
parser.add_argument('--file', action='store', default='',
                    help='specify individual dissector file to test')
parser.add_argument('--folder', action='store', default='',
                    help='specify folder to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')

args = parser.parse_args()


# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add single specified file..
    if not os.path.isfile(args.file):
        print('Chosen file', args.file, 'does not exist.')
        exit(1)
    files.append(args.file)
elif args.commits:
    # Get files affected by specified number of commits.
    files = changedFilesFromGit('HEAD~' + args.commits)
elif args.open:
    # Unstaged changes, followed by any staged changes not already listed.
    files = changedFilesFromGit()
    for f in changedFilesFromGit('--staged'):
        if f not in files:
            files.append(f)
else:
    # By default, scan dissectors
    folder = os.path.join('epan', 'dissectors')
    # But overwrite with any folder entry.
    if args.folder:
        folder = args.folder
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)

    # Find files from folder.
+ print('Looking for files in', folder) + files = findFilesInFolder(folder) + + +# If scanning a subset of files, list them here. +print('Examining:') +if args.file or args.commits or args.open: + if files: + print(' '.join(files), '\n') + else: + print('No files to check.\n') +else: + print('All dissector modules\n') + + +# Now check the chosen files. +for f in files: + # Jump out if control-C has been pressed. + if should_exit: + exit(1) + checkFile(f) + + + +# Show the (up to 100) most commonly not-recognised words, each with its count. TODO: depend upon a command-line option here? +print('') +counter = Counter(missing_words).most_common(100) +if len(counter) > 0: + for c in counter: + print(c[0], ':', c[1]) + +# Show error count (the length of missing_words, so repeated words are counted each time). +print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n') \ No newline at end of file diff --git a/tools/wireshark_words.txt b/tools/wireshark_words.txt new file mode 100644 index 0000000000..9115371900 --- /dev/null +++ b/tools/wireshark_words.txt @@ -0,0 +1,872 @@ + +0x%02x +0x%08x +1xrtt +3gpp2 +80211n +accelerometer +accessors +acknowledgement +acp133 +actuator +adwin +aes128 +aes256 +aggregator +agnss +aironet +airpcap +airtel +alcap +alljoyn +alloc +allocators +amperage +analyzers +analyzes +annexc +appdata +appid +arfcn +asn1cnf +asn2wrs +assymetric +async +asynchronously +atheros +atomically +attrib +attrs +authenticates +authenticator +authtoken +authtype +autoconfiguration +autodiscovery +available +avaya +backhaul +backoff +bacnet +bcast +beamformed +beamformee +beamformer +beamforming +bitfield +bitmask +bitrate +bitstring +blackhole +bnode +bootfile +bootloader +bootp +broadcom +bsmap +bssid +bssids +bssmap +btatt +btcommon +bthci +btmesh +btsdp +btsnoop +byte +byteorder +cablelabs +callback +callid +callsign +canceled +canceling +cancelled +cannot +canonicalized +capinfos +capsa +capwap +carrierfreq +carrierid +cccid +ccpch +cctrch +cdma2000 +celcius +cellid +cellidentity +chan1 +chan2 +channelisation +charset 
+charsets +checkbox +checkout +chocolatey +chunked +ciphered +ciphering +ciphersuite +ciphertext +citrix +classmark +classmark3 +cmake +cmdcontrol +codebook +codepoint +codeset +codingrate +coloring +colorise +colorization +colorize +colorized +colorizing +combiner +concatenate +concatenated +concatenates +concurrent +configitem +conformant +connectionless +connid +const +contactless +contiguously +copyfile +couchbase +cpdlc +cpich +cpuregisters +credential +credentials +criticalextensions +criticalextensionsfuture +crnti +crypto +cryptographic +csapi +ctype +customizable +customizing +datagram +datagrams +dataitem +datarate +datastate +datetime +dcerpc +deact +deactivated +deactivating +deactivation +deassertion +deauth +deauthenticated +deauthentication +debian +debug +dechunk +decompressing +decompressor +decremented +decrementing +decrypt +decrypted +decrypting +decryption +defragment +defragmentation +defragmented +defragmenting +dehumidification +delimiters +demultiplexer +demultiplexers +deprecated +deregister +deregistered +deregistering +des40 +descr +desegment +desegmentation +desegmenting +deselect +devmode +dfilter +dfsauth +dhcpv +diffie +diplexer +directionality +dissection +dissector +dissectors +distinguisher +diversifier +dlmap +dlsch +dmepi +docsis +doesn't +double +downlink +dpauxmon +dpnss +drbid +dsmcc +dstport +dumpcap +earfcn +ebcdic +ecdhe +ecdsa +editcap +egprs +eigrp +elink +ellipsoid +encap +encaps +encapsulations +enciphered +encrypt +encrypting +endian +endianness +entryid +enumerations +epasv +errorcode +errored +errorportinfo +erspan +etheraddr +ethertype +ettarr +etype +eutra +eutran +extattr +extcap +extensibility +extrainformation +failover +fiber +fileset +firewall +flag1 +flag2 +flavored +flowid +flowmod +flowspec +format0 +fortigate +fortinet +fpiur +framenum +framenumber +framenun +frametype +fsctl +functionalities +funkt +fvalue +ganss +gboolean +gchar +gcrypt +gendc +geoip +geonw +geran +getattr +getnext +gigamon +github 
+gitlab +gluster +gmprs +goaway +google +gprscdr +groupa +groupb +groupcast +groupmod +guint +handoff +hangup +harqid +hartip +hashed +hazelcast +heuristic +hfarr +HI2Operations +hnbap +homeplug +hopcount +hostname +hsdpa +hsdsch +hspdsch +http2 +https +icmpv +ident +idl2wrs +iec60870 +ieee17221 +ieee80211 +iface +ifconfig +ikev2 +illuminance +implementor +incits +incrementing +infile +infiniband +infolist +informationitem +informationlist +initialise +initialising +initialization +initialize +initialized +initializer +initializers +initializes +initializing +inline +interleaving +interruptible +interworking +invalidation +ioctl +ipaddr +ipaddress +ipfix +ipprim +ipsec +iptrace +ipv4addr +isobus +iterator +itunes +iwarp +jetds +kademlia +keepalive +kerberos +keylen +keylog +keypress +keyring +keytab +knxip +l2cap +lanalyzer +lcgid +lcids +leasequery +libgcrypt +libpcap +linkaddr +linkinfo +linux +list1 +lithionics +logcat +loghans +loglocal +logoff +logout +loopback +lscap +lucent +luminance +macaddr +macaddress +mailto +malloc +mcast +megaco +mellanox +memcache +menubar +mergecap +messageid +metadata +meteorological +microbit +midamble +miniport +minislot +minislots +minus1 +mirrorlink +misconfiguration +misconfigured +mode01 +mode7 +modepage +modespecificinfo +mpeg4 +mpsse +mrcpv +msgsend +mtftp +mtrace +multiband +multicarrier +multicast +multicasted +multicore +multiframe +multiframes +multihop +multilateration +multipacket +multipart +multipath +multiplexed +multiplexer +multiplexers +multiplexing +multirat +multirate +multislot +multistate +nacks +namelen +namespace +narrowband +nbrar +netboot +netfilter +netflow +nethop +netlink +netlogon +netmask +netmon +netscaler +nettl +newpw +nexthop +nfs4err +ngsniffer +niagra +nonblock +noncriticalextension +noncriticalextensions +notif +notifier +notused +npcap +nprach +nsapi +nstime +nstrace +objectid +objkey +obsoleted +octets +octetstring +ofdma +offloadability +ofpat +ofppf +ofpxmt +om2000 +onduration +onoff 
+ontime +opcode +openvpn +opnum +optimizations +ospf6 +outhdr +packetcable +packetization +packetized +param +parameterization +parameterized +params +parlay +parms +passcode +passkey +passthrough +passwd +pcapng +pcell +pcmax +pcmaxc +pdcch +pdsch +peeraddr +phich +phonebook +physcellid +picmg +pinfo +plaintext +plugin +plugins +pname +polestar +popup +portcounters +portinfo +portmod +portnumber +portstatus +powercontrol +prach +preconfiguration +preconfigured +preempting +preemption +prefs +preloaded +prepay +prepend +preshared +prioritized +privkey +procid +profidrive +profinet +protected +protoabbrev +protobuf +protocolie +pscell +pseudowire +ptvcursor +pubdir +pubkey +pucch +pusch +pytest +qam16 +qam64 +qnet6 +radiotap +ranap +randomizer +randpkt +reachability +readme +realloc +realtime +reassigning +reauth +reauthentication +reauthorize +rebinding +recalculate +recalculating +recognizer +reconf +reconfig +reconfigure +reconfigured +reconfrqst +redelivery +redistributable +redistributables +reencyption +reestablishment +referer +referrer +regex +reimplemented +reinitialize +reinitializing +rekey +rekeying +reoptimization +reordercap +reorigination +representable +reprogrammable +reprogramming +requester +requestor +rerouting +resend +reservable +reserved +reserved0 +reserved1 +reserved2 +reserved3 +reserved4 +reserved5 +resize +resolver +resynchronization +retrans +retransmission +retransmissions +retransmit +retransmits +retransmitted +retries +retry +retyping +rfcomm +rlcmac +rnsap +roamer +routable +rpcap +rtpmidi +sanitize +satisfiable +scaler +scannable +scell +scoped +scrollbar +segno +semiautomatic +seqno +seqnum +sequenceno +serialize +serialized +sessionid +setattr +setuid +severities +sflow +sha256 +sha384 +sha512 +sharkd +shouldn't +siapp +sidelink +signaling +slsch +sname +snaplen +snow3g +someip +spare +spare1 +spare2 +spare3 +spare4 +spare5 +spare6 +spare7 +spare8 +spare9 +spcell +spnego +spooled +srbid +srcport +ssupervisor +stateful +statusbar 
+streamid +struct +subaddress +subband +subcarrier +subcarriers +subchannel +subcode +subdevice +subdissector +subdissectors +subelem +subelement +subelements +subframes +subheader +subheaders +subids +subindex +subm +submode +subnet +subnets +subobj +subobject +suboption +suboptions +subparam +subpdu +subpm +subquery +subselect +subselection +subslot +subtlv +subtree +subtrees +switchinfo +synchronizing +synphasor +sysdig +sysex +sysframe +syslog +systemd +tablemod +tcpip +tcpudp +tdd128 +tdd384 +tdd768 +teredo +text2pcap +timeout +timeslot +timestamp +timestamps +timezone +toggling +toolongfragment +tooltip +touchlink +traceroute +transcoder +truncate +tshark +tspec +tunneled +tunneling +tvbuff +type1 +type2 +type3 +typedef +uarfcn +uboot +ubuntu +udpcp +uint16 +uint32 +uint8 +ulmap +ulsch +unaligned +unassign +unauthenticated +uncalculated +unciphered +uncompress +uncompressing +uncompression +unconfigurable +unconfigured +unconfirm +uncorrectable +undecipherable +undecodable +undecoded +undecryptable +undecrypted +undeliverable +underflow +underrun +undissected +unencrypted +unescaped +unfragmented +unhandled +unicast +unicode +unignore +unimplemented +uninitialized +uninstall +uninstaller +unknown1 +unlink +unmarshal +unparsable +unparsed +unpunctuated +unreassembled +unrecoverable +unrecovered +unregister +unregistration +unreportable +unresponded +unroutable +unsecure +unsegmented +unsequenced +unsubscribe +unsynchronized +untagged +untruncated +untrusted +untunelled +uplink +upload +uploaded +uploading +urlencoded +urnti +userdata +userinfo +userlist +userplane +utilization +utran +v1250 +v1310 +v1410 +v1530 +v1610 +verizon +version2 +version3 +version4 +version5 +version6 +version7 +versioning +virtualization +volerr +wakeup +webcam +websocket +wideband +wikipedia +wimax +winpcap +winspool +wireshark +wiretap +withfcs +withoutfcs +wksta +writable +wslua +wsluarm +x509sat +xchannel +xmlns +z3950 +zigbee -- cgit v1.2.3