From d726f1e1df65a3c9db32d5d0ce9acea1ca094a07 Mon Sep 17 00:00:00 2001 From: sw5678 <151949597+sw5678@users.noreply.github.com> Date: Mon, 12 May 2025 14:03:07 +0100 Subject: [PATCH 1/5] Added word count operation --- src/core/config/Categories.json | 3 +- src/core/operations/WordCount.mjs | 116 ++++++++++++++++++++++++++ tests/operations/index.mjs | 1 + tests/operations/tests/WordCount.mjs | 117 +++++++++++++++++++++++++++ 4 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 src/core/operations/WordCount.mjs create mode 100644 tests/operations/tests/WordCount.mjs diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index 7f9591e0e..0226e33bb 100644 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -337,7 +337,8 @@ "Sleep", "File Tree", "Take nth bytes", - "Drop nth bytes" + "Drop nth bytes", + "Word Count" ] }, { diff --git a/src/core/operations/WordCount.mjs b/src/core/operations/WordCount.mjs new file mode 100644 index 000000000..65260e536 --- /dev/null +++ b/src/core/operations/WordCount.mjs @@ -0,0 +1,116 @@ +/** + * @author sw5678 + * @copyright Crown Copyright 2016 + * @license Apache-2.0 + */ + +import Operation from "../Operation.mjs"; +import Utils from "../Utils.mjs"; +import {LETTER_DELIM_OPTIONS} from "../lib/Delim.mjs"; +import {caseInsensitiveSort} from "../lib/Sort.mjs"; + + +/** + * Word Count operation + */ +class WordCount extends Operation { + + /** + * Word Count constructor + */ + constructor() { + super(); + + this.name = "Word Count"; + this.module = "Default"; + this.description = "Provides a count of each word in a given text"; + this.inputType = "string"; + this.outputType = "string"; + this.args = [ + { + name: "Delimiter", + type: "option", + value: LETTER_DELIM_OPTIONS + }, + { + "name": "Include Total", + "type": "boolean", + "value": true + }, + { + "name": "Order", + "type": "option", + "value": ["Alphabetical", "Count"] + } + ]; + } + + /** + * @param {string} input + * @param {Object[]} args + * @returns {string} + */ + run(input, args) { + + const delimiter = Utils.charRep(args[0]); + + // Lower case and split + const inputArray = input.replace(/(?:\r\n|\r|\n)/g, delimiter).toLowerCase().split(delimiter); + + // Count up the words + const counter = {}; + let total = 0; + for (let j = 0; j < inputArray.length; j++) { + + // Trim whitespace and replace punctuation + const word = inputArray[j].replace(/(?:!|"|#|\$|%|&|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|\{|\||\}|~|£)/g, "").trim(); + + // If empty string or ', then skip + if (word === "" || /[']+/.test(word)) { + continue; + } else if (word in counter) { + counter[word]++; + total++; + } else { + counter[word] = 1; + total++; + } + } + + // Sort results + let order; + if (args[2] === "Alphabetical") { + // Sort alphabetically + order = Object.keys(counter).sort(caseInsensitiveSort); + } else if (args[2] === "Count") { + // Sort by count + // Create the array of key-value pairs + order = Object.keys(counter).map((key) => { + return [key, counter[key]]; + }); + // Sort the array based on the second element (i.e. the value) + order.sort((first, second) => { + return second[1] - first[1]; + }); + // Obtain the list of keys in sorted order of the values. + order = order.map((e) => { + return e[0]; + }); + } + + // Process output to string + let output = "WORD,COUNT\n"; + for (let k = 0; k < order.length; k++) { + output = output + order[k] + "," + counter[order[k]] + "\n"; + } + + // Add total counter at the bottom + if (args[1]) { + output = output + "TOTAL," + total; + } + + return output; + } +} + +export default WordCount; \ No newline at end of file diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs index ab1ceb8fe..d0b5820d4 100644 --- a/tests/operations/index.mjs +++ b/tests/operations/index.mjs @@ -170,6 +170,7 @@ import "./tests/JA3Fingerprint.mjs"; import "./tests/JA3SFingerprint.mjs"; import "./tests/HASSH.mjs"; import "./tests/JSONtoYAML.mjs"; +import "./tests/WordCount.mjs"; // Cannot test operations that use the File type yet // import "./tests/SplitColourChannels.mjs"; diff --git a/tests/operations/tests/WordCount.mjs b/tests/operations/tests/WordCount.mjs new file mode 100644 index 000000000..41f670480 --- /dev/null +++ b/tests/operations/tests/WordCount.mjs @@ -0,0 +1,117 @@ +/** + * @author sw5678 + * @copyright Crown Copyright 2023 + * @license Apache-2.0 + */ +import TestRegister from "../../lib/TestRegister.mjs"; + +TestRegister.addTests([ + { + "name": "Word Count: Empty test 1", + "input": "", + "expectedOutput": "WORD,COUNT\nTOTAL,0", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", true, "Alphabetical"], + }, + ], + }, + { + "name": "Word Count: Empty test 2", + "input": "", + "expectedOutput": "WORD,COUNT\nTOTAL,0", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", true, "Count"], + }, + ], + }, + { + "name": "Word Count: Empty test 3", + "input": "", + "expectedOutput": "WORD,COUNT\n", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", false, "Alphabetical"], + }, + ], + }, + { + "name": "Word Count: Empty test 4", + "input": "", + "expectedOutput": "WORD,COUNT\n", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", false, "Count"], + }, + ], + }, + { + "name": "Word Count: Count test 1", + "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", + "expectedOutput": "WORD,COUNT\nhello,2\nworld,3\nTOTAL,5", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", true, "Alphabetical"], + }, + ], + }, + { + "name": "Word Count: Count test 2", + "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", + "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\nTOTAL,5", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", true, "Count"], + }, + ], + }, + { + "name": "Word Count: Count test 3", + "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", + "expectedOutput": "WORD,COUNT\nhello,2\nworld,3\n", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", false, "Alphabetical"], + }, + ], + }, + { + "name": "Word Count: Count test 4", + "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", + "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Space", false, "Count"], + }, + ], + }, + { + "name": "Word Count: Different delimiter test", + "input": "Hello, World\nhello, world \n''!@£$%^&*()_+=-[]{};'|:/.,<>? world", + "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n", + + "recipeConfig": [ + { + "op": "Word Count", + "args": ["Comma", false, "Count"], + }, + ], + } +]); \ No newline at end of file From cbf8a7773ce741af8af71eddfb0191676b74896a Mon Sep 17 00:00:00 2001 From: sw5678 <151949597+sw5678@users.noreply.github.com> Date: Mon, 12 May 2025 14:06:57 +0100 Subject: [PATCH 2/5] Fixed linting errors --- src/core/operations/WordCount.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/operations/WordCount.mjs b/src/core/operations/WordCount.mjs index 65260e536..a94eb0ac9 100644 --- a/src/core/operations/WordCount.mjs +++ b/src/core/operations/WordCount.mjs @@ -113,4 +113,4 @@ class WordCount extends Operation { } } -export default WordCount; \ No newline at end of file +export default WordCount; From 3366fe3b026b6e4ab248f67cd7bc6c555efd71ae Mon Sep 17 00:00:00 2001 From: sw5678 <151949597+sw5678@users.noreply.github.com> Date: Mon, 12 May 2025 14:09:24 +0100 Subject: [PATCH 3/5] Fixed linting errors --- tests/operations/tests/WordCount.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/operations/tests/WordCount.mjs b/tests/operations/tests/WordCount.mjs index 41f670480..bc518de2d 100644 --- a/tests/operations/tests/WordCount.mjs +++ b/tests/operations/tests/WordCount.mjs @@ -114,4 +114,4 @@ TestRegister.addTests([ }, ], } -]); \ No newline at end of file +]); From c2dce58db297354e371734d13006eb3c5a24b019 Mon Sep 17 00:00:00 2001 From: sw5678 <151949597+sw5678@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:42:27 +0100 Subject: [PATCH 4/5] Made word count slightly better --- src/core/operations/WordCount.mjs | 10 +++++----- tests/operations/tests/WordCount.mjs | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/core/operations/WordCount.mjs b/src/core/operations/WordCount.mjs index a94eb0ac9..a34e88f90 100644 --- a/src/core/operations/WordCount.mjs +++ b/src/core/operations/WordCount.mjs @@ -63,7 +63,9 @@ class WordCount extends Operation { for (let j = 0; j < inputArray.length; j++) { // Trim whitespace and replace punctuation - const word = inputArray[j].replace(/(?:!|"|#|\$|%|&|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|\{|\||\}|~|£)/g, "").trim(); + const word = inputArray[j].replace(/[!"#\$%&\(\)\*\+,-\.\/:;<=>\?@\[\\\]\^_`\{\\}~£\|]/g, "").trim(); + + // If empty string or ', then skip if (word === "" || /[']+/.test(word)) { @@ -100,13 +102,11 @@ class WordCount extends Operation { // Process output to string let output = "WORD,COUNT\n"; - for (let k = 0; k < order.length; k++) { - output = output + order[k] + "," + counter[order[k]] + "\n"; - } + output = output + order.map(entry => `${entry},${counter[entry]}`).join('\n'); // Add total counter at the bottom if (args[1]) { - output = output + "TOTAL," + total; + output = output + "\nTOTAL," + total; } return output; diff --git a/tests/operations/tests/WordCount.mjs b/tests/operations/tests/WordCount.mjs index bc518de2d..4d210260e 100644 --- a/tests/operations/tests/WordCount.mjs +++ b/tests/operations/tests/WordCount.mjs @@ -9,7 +9,7 @@ TestRegister.addTests([ { "name": "Word Count: Empty test 1", "input": "", - "expectedOutput": "WORD,COUNT\nTOTAL,0", + "expectedOutput": "WORD,COUNT\n\nTOTAL,0", "recipeConfig": [ { @@ -21,7 +21,7 @@ TestRegister.addTests([ { "name": "Word Count: Empty test 2", "input": "", - "expectedOutput": "WORD,COUNT\nTOTAL,0", + "expectedOutput": "WORD,COUNT\n\nTOTAL,0", "recipeConfig": [ { @@ -81,7 +81,7 @@ TestRegister.addTests([ { "name": "Word Count: Count test 3", "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", - "expectedOutput": "WORD,COUNT\nhello,2\nworld,3\n", + "expectedOutput": "WORD,COUNT\nhello,2\nworld,3", "recipeConfig": [ { @@ -93,7 +93,7 @@ TestRegister.addTests([ { "name": "Word Count: Count test 4", "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world", - "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n", + "expectedOutput": "WORD,COUNT\nworld,3\nhello,2", "recipeConfig": [ { @@ -105,7 +105,7 @@ TestRegister.addTests([ { "name": "Word Count: Different delimiter test", "input": "Hello, World\nhello, world \n''!@£$%^&*()_+=-[]{};'|:/.,<>? world", - "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n", + "expectedOutput": "WORD,COUNT\nworld,3\nhello,2", "recipeConfig": [ { From 27e4d7e3bb1def332f9ef3ae60eba89633adcf40 Mon Sep 17 00:00:00 2001 From: sw5678 <151949597+sw5678@users.noreply.github.com> Date: Thu, 31 Jul 2025 14:03:43 +0100 Subject: [PATCH 5/5] Fixing linting issues --- src/core/operations/WordCount.mjs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/core/operations/WordCount.mjs b/src/core/operations/WordCount.mjs index a34e88f90..98fc2d82f 100644 --- a/src/core/operations/WordCount.mjs +++ b/src/core/operations/WordCount.mjs @@ -63,9 +63,7 @@ class WordCount extends Operation { for (let j = 0; j < inputArray.length; j++) { // Trim whitespace and replace punctuation - const word = inputArray[j].replace(/[!"#\$%&\(\)\*\+,-\.\/:;<=>\?@\[\\\]\^_`\{\\}~£\|]/g, "").trim(); - - + const word = inputArray[j].replace(/[!"#$%&()*+,-./:;<=>?@[\\\]^_`{}~£|]/g, "").trim(); // If empty string or ', then skip if (word === "" || /[']+/.test(word)) { @@ -102,7 +100,7 @@ class WordCount extends Operation { // Process output to string let output = "WORD,COUNT\n"; - output = output + order.map(entry => `${entry},${counter[entry]}`).join('\n'); + output = output + order.map(entry => `${entry},${counter[entry]}`).join("\n"); // Add total counter at the bottom if (args[1]) {