作业帮 > 综合 > 作业

用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出.修改下面代码就行了.

来源:学生作业帮 编辑:搜搜做题作业网作业帮 分类:综合作业 时间:2024/08/15 05:22:04
用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出.修改下面代码就行了.
/**
 * Exercise skeleton for a word-frequency counter over an English article.
 * Only the constructor has a body; every other method is an empty stub that
 * the exercise asks the reader to implement.
 */
public class Article {
// Holds the article text.
String content;
// Holds the words obtained by splitting the article.
String[] rawWords;
// Holds the words after counting.
String[] words;
// Holds the frequency that corresponds to each word.
int[] wordFreqs;
// Constructor: supplies the article text (hard-coded sample below).
// Extension task: read the text from a file instead.
public Article()
{
content="kolya is one of the richest films i've seen in some time .zdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) ,who finds his life as a czech cellist increasingly impacted by the five-year old boy that he's taking care of .though it ends rather abruptly-- and i'm whining ,'cause i wanted to spend more time with these characters-- the acting ,writing ,and production values are as high as ,if not higher than ,comparable american dramas .this father-and-son delight-- sverak also wrote the script ,while his son ,jan ,directed-- won a golden globe for best foreign language film and ,a couple days after i saw it ,walked away an oscar .in czech and russian ,with english subtitles .";
}
// Intended to split the article on delimiters and store the result in rawWords.
// Stub: not implemented in this skeleton.
public void splitWord()
{
}
// Intended to count the words by traversing the array.
// Stub: not implemented in this skeleton.
public void countWordFreq()
{
}
// Intended to sort the word array and the frequency array in descending
// order of frequency.
// Stub: not implemented in this skeleton.
public void sort()
{
}
// Intended to print the sorted result.
// Stub: not implemented in this skeleton.
public void printResult()
{
}
public static void main(String[] args)
{
// Exercise the class here.
}
}
(2)在上面的基础上完成从文件夹中读取所有文章,输出每篇文章词频最高的10个词.
用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出.修改下面代码就行了.
这道题如果允许增加一个类,实现会高效得多;如果必须在给定的框架内完成,代码会比较繁琐,效率也较低。
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
public class Article {
//保存文章的内容
String content;
//保存分割后的单词集合
String[] rawWords;
//保存统计后的单词集合
String[] words;
//保存单词对应的词频
int[] wordFreqs;
//构造函数,输入文章内容
//提高部分:从文件中读取
public Article() {
content = "kolya is one of the richest films i've seen in some time . zdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) , who finds his life as a czech cellist increasingly impacted by the five-year old boy that he's taking care of . though it ends rather abruptly-- and i'm whining , 'cause i wanted to spend more time with these characters-- the acting , writing , and production values are as high as , if not higher than , comparable american dramas . this father-and-son delight-- sverak also wrote the script , while his son , jan , directed-- won a golden globe for best foreign language film and , a couple days after i saw it , walked away an oscar . in czech and russian , with english subtitles . ";
}
//对文章根据分隔符进行分词,将结果保存到rawWords数组中
public void splitWord(){
//分词的时候,因为标点符号不参与,所以所有的符号全部替换为空格
final char SPACE = ' ';
content = content.replace('\'', SPACE).replace(',', SPACE).replace('.', SPACE);
content = content.replace('(', SPACE).replace(')', SPACE).replace('-', SPACE);

rawWords = content.split("\\s+");//凡是空格隔开的都算单词,上面替换了', 所以I've 被分成2个 //单词
}
//统计词,遍历数组
public void countWordFreq() {
//将所有出现的字符串放入唯一的set中,不用map,是因为map寻找效率太低了
Set set = new TreeSet();

for(String word: rawWords){
set.add(word);
}

Iterator ite = set.iterator();
List wordsList = new ArrayList();
List freqList = new ArrayList();
//多少个字符串未知,所以用list来保存先
while(ite.hasNext()){
String word = (String) ite.next();

int count = 0;//统计相同字符串的个数
for(String str: rawWords){
if(str.equals(word)){
count++;
}
}

wordsList.add(word);
freqList.add(count++);
}

//存入数组当中
words = wordsList.toArray(new String[0]);

wordFreqs = new int[freqList.size()];
for(int i = 0; i < freqList.size(); i++){
wordFreqs[i] = freqList.get(i);
}

}
//根据词频,将词数组和词频数组进行降序排序
public void sort() {

class Word{
private String word;
private int freq;

public Word(String word, int freq){
this.word = word;
this.freq = freq;
}
}
//注意:此处排序,1)首先按照词频降序排列, 2)如果词频相同,按照字母降序排列,
//如 'abc' > 'ab' >'aa'
class WordComparator implements Comparator{
public int compare(Object o1, Object o2) {
Word word1 = (Word) o1;
Word word2 = (Word) o2;

if(word1.freq < word2.freq){
return 1;
}else if(word1.freq > word2.freq){
return -1;
}else{

int len1 = word1.word.trim().length();
int len2 = word2.word.trim().length();

String min = len1 > len2? word2.word: word1.word;
String max = len1 > len2? word1.word: word2.word;

for(int i = 0; i < min.length(); i++){
if(min.charAt(i) < max.charAt(i)){
return 1;
}
}

return 1;

}
}

}

List wordList = new ArrayList();

for(int i = 0; i < words.length; i++){
wordList.add(new Word(words[i], wordFreqs[i]));
}

Collections.sort(wordList, new WordComparator());

for(int i = 0; i < wordList.size(); i++){
Word wor = (Word) wordList.get(i);

words[i] = wor.word;
wordFreqs[i] = wor.freq;
}

}
//将排序结果输出
public void printResult() {
System.out.println("Total " + words.length + " different words in the content!");
for(int i = 0; i < words.length; i++){
System.out.println(wordFreqs[i] + " " + words[i]);
}
}
//测试类的功能
public static void main(String[] args) {
Article a = new Article();
a.splitWord();
a.countWordFreq();
a.sort();
a.printResult();
}
}
-----------------------
Total 99 different words in the content!
5 and
4 the
4 i
4 a
3 as
2 with
2 who
2 to
2 time
2 sverak
2 son
2 s
2 old
2 of
2 it
2 in
2 his
2 czech
1 zdenek
1 year
1 wrote
1 writing
1 won
1 whining
1 while
1 wanted
1 walked
1 ve
1 values
1 though
1 this
1 these
1 that
1 than
1 taking
1 subtitles
1 spend
1 some
1 so
1 seen
1 script
1 saw
1 russian
1 richest
1 remain
1 rather
1 production
1 plays
1 oscar
1 one
1 not
1 more
1 m
1 likely
1 life
1 language
1 kolya
1 jan
1 is
1 increasingly
1 impacted
1 if
1 higher
1 high
1 he
1 golden
1 globe
1 foreign
1 for
1 five
1 finds
1 films
1 film
1 father
1 english
1 ends
1 dramas
1 directed
1 delight
1 days
1 couple
1 confirmed
1 comparable
1 characters
1 cellist
1 cause
1 care
1 by
1 boy
1 best
1 bachelor
1 away
1 are
1 an
1 american
1 also
1 after
1 acting
1 abruptly
再问: 能加上注释吗?谢谢。另外,(2) 在上面的基础上完成从文件夹中读取所有文章,输出每篇文章词频最高的10个词。追加50分。
再答: 长度限制,增加构造方法( 1) int[] wordFreqs; public Article(File file) throws IOException{ BufferedReader bf = new BufferedReader(new FileReader(file)); String lineContent = ""; StringBuilder sb = new StringBuilder(); while(lineContent != null){ lineContent = bf.readLine(); if(lineContent == null){ break; } sb.append(lineContent).append(" "); } content = sb.toString(); } (2)重写 public void printResult() { System.out.println("Total " + words.length + " different words in the content!"); for(int i = 0, j = 1; i < words.length; i++){ if(j++ > 10){ break; } System.out.println(wordFreqs[i] + " " + words[i]); } } (3) 重写main方法 public static void main(String[] args) throws IOException { File file = new File("C://test");//测试文件夹 if(!file.isDirectory()){ throw new IOException("It should be a directory!"); } File[] files = file.listFiles(); for(File fl: files){ if(fl.isFile()){ String absolutePath = fl.getAbsolutePath(); System.out.println("For file \"" + absolutePath + "\", the top 10 words are: "); Article a = new Article(fl); a.splitWord(); a.countWordFreq(); a.sort(); a.printResult(); } } } (4) ----------测试结果 For file "C:\test\1.txt", the top 10 words are: Total 99 different words in the content! 5 and 4 the 4 i 4 a 3 as 2 with 2 who 2 to 2 time 2 sverak For file "C:\test\3.txt", the top 10 words are: Total 99 different words in the content! 5 and 4 the 4 i 4 a 3 as 2 with 2 who 2 to 2 time 2 sverak For file "C:\test\a.txt", the top 10 words are: Total 9 different words in the content! 2 bb 1 m 1 fff 1 ef 1 ee 1 cc 1 c 1 adsl 1 a For file "C:\test\b.txt", the top 10 words are: Total 99 different words in the content! 5 and 4 the 4 i 4 a 3 as 2 with 2 who 2 to 2 time 2 sverak
用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出.修改下面代码就行了. 用Java语言设计一个程序,从键盘输入3个整数,按照从小到大的顺序输出.是用Java语言哦, 请教一下有没有支持中文的词频统计软件?比如一篇WORD中出现... 有晓得的人就说下哈,真心谢谢大伙了 请问一下谁知道有没有支持中文的词频统计软件?比如一篇WORD中出现... 熟悉的看下吧,打心底麻烦各位了 请教大家下有没有支持中文的词频统计软件?比如一篇WORD中出现...有会的人说下嘛, 请问下大家知道有没有支持中文的词频统计软件?比如一篇WORD中出现... 英语单词词频统计软件怎么样把十年考研英语完形填空所有选项的单词做一次词频统计,把相同的单词出现次数统计出来? EXCEL统计问题如图将1-10行的数据全部转成词号-词频两列的形式,并统计出每个词号出现的次数.对于重复出现的词号,词 用c语言编程,要求从键盘读入一个由单词和空格组成的英文长句,分解其中单词,并按照字典顺序排列输出 中国知网查询文献的选框里有个“词频”是什么意思?应该怎么用?还有一个相近词选框,选了一些词之后,检索的文献比原来少了! 请使用java语言编写一段程序,统计这段文字中单词的个数,并输出其中由四个字母组成的单词 JAVA统计一篇文章中所有单词出现的次数,并按字典序将单词及频数输出到文件中