华为OD机试:中文分词模拟器
题目描述
给定一个连续不包含空格的字符串,该字符串仅包含英文小写字母及英文标点符号(逗号、分号、句号),同时给定词库,对该字符串进行精确分词。
说明:
输入描述
第一行输入待分词语句 "ilovechina"
第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"
输出描述
按顺序输出分词结果 "i,love,china"
用例
题目解析
1.
将输入的待分词语句和词库分别存储到变量中。
2.
创建一个空列表用于存储分词结果。
3.
从待分词语句的第一个字符开始,逐个字符向后匹配,直到找到最长匹配的词或无法继续匹配为止。
4.
如果找到最长匹配的词,则将其添加到分词结果列表中,并将匹配位置后移一位。
5.
重复步骤3和4,直到遍历完整个待分词语句。
6.
输出分词结果列表。
const rl = require("readline").createInterface({ input: process.stdin }); var iter = rl[Symbol.asyncIterator](); const readline = async () => (await iter.next()).value; // 输入输出处理 void (async function () { const sentences = (await readline()).split(/[,.;]/); const words = (await readline()).split(/[,.;]/); console.log(getResult(sentences, words)); })(); function getResult(sentences, words) { // wordMap 记录词库词汇 const wordMap = new Map(words.map((word, i) => [word, i])); // ans记录最终分词结果 const ans = []; while (sentences.length > 0) { const sentence = sentences.shift(); let r = sentence.length; for (; r > 0; r--) { // 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配,并且由于是必须从0索引开始截取,因此满足了分词顺序优先 const fragment = sentence.slice(0, r); // 若词库中是否存在该子串词汇 if (wordMap.has(fragment)) { // 则将对应子串词汇纳入结果 ans.push(fragment); wordMap.delete(fragment); // 我理解词库中每个词汇只能使用一次,因此这里将词库中使用过的词汇移除 // 若子串词汇只是句子部分,则句子剩余部分还要继续去词库中查找 if (r < sentence.length) { sentences.unshift(sentence.slice(r)); } break; } } // 没有在词库中找到对应子串词汇,则输出单个字母 if (r == 0) { // 注意,这里只放一个字母到结果中,句子剩余部分继续去词库中查找 ans.push(sentence[0]); if (sentence.length > 1) { sentences.unshift(sentence.slice(1)); } } } return ans.join(","); }
import java.util.*; public class Main { public static void main(String[] args) { Scanner sc = new Scanner(System.in); String[] sentences = sc.nextLine().split("[,.;]"); String[] words = sc.nextLine().split("[,.;]"); System.out.println(getResult(sentences, words)); } public static String getResult(String[] sentences, String[] words) { HashSet<String> wordSet = new HashSet<>(Arrays.asList(words)); LinkedList<String> queue = new LinkedList<>(Arrays.asList(sentences)); LinkedList<String> ans = new LinkedList<>(); while (!queue.isEmpty()) { String sentence = queue.removeFirst(); int r = sentence.length(); for (; r > 0; r--) { String fragment = sentence.substring(0, r); if (wordSet.contains(fragment)) { ans.addLast(fragment); wordSet.remove(fragment); if (r < sentence.length()) { queue.addFirst(sentence.substring(r)); } break; } } if (r == 0) { ans.add(sentence.charAt(0) + ""); if (sentence.length() > 1) { queue.addFirst(sentence.substring(1)); } } } return String.join(",", ans); } }
import re def get_result(sentences, words): word_set = set(words) ans = [] for sentence in sentences: r = len(sentence) while r > 0: fragment = sentence[:r] if fragment in word_set: ans.append(fragment) word_set.remove(fragment) if r < len(sentence): sentence = sentence[r:] r = len(sentence) else: break else: r -= 1 if r == 0: ans.append(sentence[0]) if len(sentence) > 1: sentence = sentence[1:] r = len(sentence) return ",".join(ans) sentences = re.split(r"[,.;]", input()) words = re.split(r"[,.;]", input()) print(get_result(sentences, words))
#include <stdio.h> #include <stdlib.h> #include <string.h> #define MAX_LENGTH 256 #define MAX_WORDS_LENGTH 100000 #define delimiter ",.;" #define TRUE 1 #define FALSE 0 typedef struct ListNode { char *val; struct ListNode *prev; struct ListNode *next; } ListNode; ListNode *new_ListNode(char *val) { ListNode *node = (ListNode *) malloc(sizeof(ListNode)); node->val = (char *) calloc(MAX_LENGTH, sizeof(char)); strcpy(node->val, val); node->prev = NULL; node->next = NULL; return node; } typedef struct LinkedList { int size; ListNode *head; ListNode *tail; } LinkedList; LinkedList *new_LinkedList() { LinkedList *link = (LinkedList *) malloc(sizeof(LinkedList)); link->size = 0; link->head = NULL; link->tail = NULL; return link; } void addFirst_LinkedList(LinkedList *link, char *val) { ListNode *node = new_ListNode(val); if (link->size == 0) { link->head = node; link->tail = node; } else { node->next = link->head; link->head->prev = node; link->head = node; } link->size++; } void addLast_LinkedList(LinkedList *link, char *val) { ListNode *node = new_ListNode(val); if (link->size == 0) { link->head = node; link->tail = node; } else { link->tail->next = node; node->prev = link->tail; link->tail = node; } link->size++; } char *removeFirst_LinkedList(LinkedList *link) { if (link->size == 0) exit(-1); ListNode *removed = link->head; if (link->size == 1) { link->head = NULL; link->tail = NULL; } else { link->head = link->head->next; link->head->prev = NULL; } link->size--; char *res = (char *) calloc(MAX_LENGTH, sizeof(char)); strcpy(res, removed->val); free(removed); return res; } char *removeLast_LinkedList(LinkedList *link) { if (link->size == 0) exit(-1); ListNode *removed = link->tail; if (link->size == 1) { link->head = NULL; link->tail = NULL; } else { link->tail = link->tail->prev; link->tail->next = NULL; } link->size--; char *res = (char *) calloc(MAX_LENGTH, sizeof(char)); strcpy(res, removed->val); free(removed); return res; } char *substr(char *s, int l, int r) { int len = r - l; char *res = (char *) calloc(len + 1, sizeof(char)); strncpy(res, s + l, len); return res; } int main() { char s1[MAX_LENGTH]; gets(s1); LinkedList *sentences = new_LinkedList(); char *token1 = strtok(s1, delimiter); while (token1 != NULL) { addLast_LinkedList(sentences, token1); token1 = strtok(NULL, delimiter); } char s2[MAX_WORDS_LENGTH]; gets(s2); LinkedList *words = new_LinkedList(); char *token2 = strtok(s2, delimiter); while (token2 != NULL) { addLast_LinkedList(words, token2); token2 = strtok(NULL, delimiter); } LinkedList *ans = new_LinkedList(); while (sentences->size > 0) { char *sentence = removeFirst_LinkedList(sentences); int len = (int) strlen(sentence); int r = len; for (; r > 0; r--) { char *fragment = substr(sentence, 0, r); int find = FALSE; ListNode *cur = words->head; while (cur != NULL) { if (strcmp(cur->val, fragment) == 0) { addLast_LinkedList(ans, fragment); strcpy(cur->val, ""); if (r < len) { addFirst_LinkedList(sentences, substr(sentence, r, len)); } find = TRUE; break; } cur = cur->next; } if (find) { break; } } if (r == 0) { char tmp[] = {sentence[0], '\0'}; addLast_LinkedList(ans, tmp); if (len > 1) { addFirst_LinkedList(sentences, substr(sentence, 1, len)); } } } ListNode *cur = ans->head; while (cur != NULL) { printf("%s", cur->val); cur = cur->next; if (cur != NULL) { printf(","); } } return 0; }