华为OD机试:中文分词模拟器

题目描述

给定一个连续不包含空格的字符串,该字符串仅包含英文小写字母及英文标点符号(逗号、分号、句号),同时给定词库,对该字符串进行精确分词。

说明:

输入描述

第一行输入待分词语句 "ilovechina"

第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"

输出描述

按顺序输出分词结果 "i,love,china"

用例

题目解析

1.

将输入的待分词语句和词库分别存储到变量中。

2.

创建一个空列表用于存储分词结果。

3.

从待分词语句的第一个字符开始,逐个字符向后匹配,直到找到最长匹配的词或无法继续匹配为止。

4.

如果找到最长匹配的词,则将其添加到分词结果列表中,并将匹配位置后移一位。

5.

重复步骤3和4,直到遍历完整个待分词语句。

6.

输出分词结果列表。

const rl = require("readline").createInterface({ input: process.stdin });
var iter = rl[Symbol.asyncIterator]();
const readline = async () => (await iter.next()).value;

// 输入输出处理
void (async function () {
  const sentences = (await readline()).split(/[,.;]/);
  const words = (await readline()).split(/[,.;]/);

  console.log(getResult(sentences, words));
})();

function getResult(sentences, words) {
  // wordMap 记录词库词汇
  const wordMap = new Map(words.map((word, i) => [word, i]));

  // ans记录最终分词结果
  const ans = [];

  while (sentences.length > 0) {
    const sentence = sentences.shift();

    let r = sentence.length;
    for (; r > 0; r--) {
      // 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配,并且由于是必须从0索引开始截取,因此满足了分词顺序优先
      const fragment = sentence.slice(0, r);

      // 若词库中是否存在该子串词汇
      if (wordMap.has(fragment)) {
        // 则将对应子串词汇纳入结果
        ans.push(fragment);
        wordMap.delete(fragment); // 我理解词库中每个词汇只能使用一次,因此这里将词库中使用过的词汇移除

        // 若子串词汇只是句子部分,则句子剩余部分还要继续去词库中查找
        if (r < sentence.length) {
          sentences.unshift(sentence.slice(r));
        }

        break;
      }
    }

    // 没有在词库中找到对应子串词汇,则输出单个字母
    if (r == 0) {
      // 注意,这里只放一个字母到结果中,句子剩余部分继续去词库中查找
      ans.push(sentence[0]);

      if (sentence.length > 1) {
        sentences.unshift(sentence.slice(1));
      }
    }
  }

  return ans.join(",");
}

import java.util.*;

public class Main {
    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);

        String[] sentences = sc.nextLine().split("[,.;]");
        String[] words = sc.nextLine().split("[,.;]");

        System.out.println(getResult(sentences, words));
    }

    public static String getResult(String[] sentences, String[] words) {
        HashSet<String> wordSet = new HashSet<>(Arrays.asList(words));
        LinkedList<String> queue = new LinkedList<>(Arrays.asList(sentences));
        LinkedList<String> ans = new LinkedList<>();

        while (!queue.isEmpty()) {
            String sentence = queue.removeFirst();
            int r = sentence.length();
            for (; r > 0; r--) {
                String fragment = sentence.substring(0, r);
                if (wordSet.contains(fragment)) {
                    ans.addLast(fragment);
                    wordSet.remove(fragment);
                    if (r < sentence.length()) {
                        queue.addFirst(sentence.substring(r));
                    }
                    break;
                }
            }
            if (r == 0) {
                ans.add(sentence.charAt(0) + "");
                if (sentence.length() > 1) {
                    queue.addFirst(sentence.substring(1));
                }
            }
        }

        return String.join(",", ans);
    }
}
import re

def get_result(sentences, words):
    word_set = set(words)
    ans = []

    for sentence in sentences:
        r = len(sentence)
        while r > 0:
            fragment = sentence[:r]

            if fragment in word_set:
                ans.append(fragment)
                word_set.remove(fragment)
                if r < len(sentence):
                    sentence = sentence[r:]
                    r = len(sentence)
                else:
                    break
            else:
                r -= 1

        if r == 0:
            ans.append(sentence[0])
            if len(sentence) > 1:
                sentence = sentence[1:]
                r = len(sentence)

    return ",".join(ans)

sentences = re.split(r"[,.;]", input())
words = re.split(r"[,.;]", input())
print(get_result(sentences, words))

 #include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LENGTH 256
#define MAX_WORDS_LENGTH 100000
#define delimiter ",.;"
#define TRUE 1
#define FALSE 0

typedef struct ListNode {
    char *val;
    struct ListNode *prev;
    struct ListNode *next;
} ListNode;

ListNode *new_ListNode(char *val) {
    ListNode *node = (ListNode *) malloc(sizeof(ListNode));
    node->val = (char *) calloc(MAX_LENGTH, sizeof(char));
    strcpy(node->val, val);
    node->prev = NULL;
    node->next = NULL;
    return node;
}

typedef struct LinkedList {
    int size;
    ListNode *head;
    ListNode *tail;
} LinkedList;

LinkedList *new_LinkedList() {
    LinkedList *link = (LinkedList *) malloc(sizeof(LinkedList));
    link->size = 0;
    link->head = NULL;
    link->tail = NULL;
    return link;
}

void addFirst_LinkedList(LinkedList *link, char *val) {
    ListNode *node = new_ListNode(val);

    if (link->size == 0) {
        link->head = node;
        link->tail = node;
    } else {
        node->next = link->head;
        link->head->prev = node;
        link->head = node;
    }

    link->size++;
}

void addLast_LinkedList(LinkedList *link, char *val) {
    ListNode *node = new_ListNode(val);

    if (link->size == 0) {
        link->head = node;
        link->tail = node;
    } else {
        link->tail->next = node;
        node->prev = link->tail;
        link->tail = node;
    }

    link->size++;
}

char *removeFirst_LinkedList(LinkedList *link) {
    if (link->size == 0) exit(-1);

    ListNode *removed = link->head;

    if (link->size == 1) {
        link->head = NULL;
        link->tail = NULL;
    } else {
        link->head = link->head->next;
        link->head->prev = NULL;
    }

    link->size--;

    char *res = (char *) calloc(MAX_LENGTH, sizeof(char));
    strcpy(res, removed->val);

    free(removed);

    return res;
}

char *removeLast_LinkedList(LinkedList *link) {
    if (link->size == 0) exit(-1);

    ListNode *removed = link->tail;

    if (link->size == 1) {
        link->head = NULL;
        link->tail = NULL;
    } else {
        link->tail = link->tail->prev;
        link->tail->next = NULL;
    }

    link->size--;

    char *res = (char *) calloc(MAX_LENGTH, sizeof(char));
    strcpy(res, removed->val);

    free(removed);

    return res;
}

char *substr(char *s, int l, int r) {
    int len = r - l;
    char *res = (char *) calloc(len + 1, sizeof(char));
    strncpy(res, s + l, len);
    return res;
}

int main() {
    char s1[MAX_LENGTH];
    gets(s1);

    LinkedList *sentences = new_LinkedList();
    char *token1 = strtok(s1, delimiter);
    while (token1 != NULL) {
        addLast_LinkedList(sentences, token1);
        token1 = strtok(NULL, delimiter);
    }

    char s2[MAX_WORDS_LENGTH];
    gets(s2);

    LinkedList *words = new_LinkedList();
    char *token2 = strtok(s2, delimiter);
    while (token2 != NULL) {
        addLast_LinkedList(words, token2);
        token2 = strtok(NULL, delimiter);
    }

    LinkedList *ans = new_LinkedList();

    while (sentences->size > 0) {
        char *sentence = removeFirst_LinkedList(sentences);

        int len = (int) strlen(sentence);
        int r = len;

        for (; r > 0; r--) {
            char *fragment = substr(sentence, 0, r);

            int find = FALSE;

            ListNode *cur = words->head;
            while (cur != NULL) {
                if (strcmp(cur->val, fragment) == 0) {
                    addLast_LinkedList(ans, fragment);
                    strcpy(cur->val, "");

                    if (r < len) {
                        addFirst_LinkedList(sentences, substr(sentence, r, len));
                    }

                    find = TRUE;
                    break;
                }
                cur = cur->next;
            }

            if (find) {
                break;
            }
        }

        if (r == 0) {
            char tmp[] = {sentence[0], '\0'};
            addLast_LinkedList(ans, tmp);

            if (len > 1) {
                addFirst_LinkedList(sentences, substr(sentence, 1, len));
            }
        }
    }

    ListNode *cur = ans->head;

    while (cur != NULL) {
        printf("%s", cur->val);
        cur = cur->next;
        if (cur != NULL) {
            printf(",");
        }
    }

    return 0;
}

全部评论

相关推荐

1 1 评论
分享
牛客网
牛客企业服务