Master Thesis: Thesis
Results: Results
Master Thesis resources
File name: Introduction to Information Retrieval.pdf
Description: Kapitola 13 (Text classification and naive Bayes), strana 253 – 288 v knize.
Nějaký úvod, využití v různých oblastech (detekce spamu, sentimentu atd.), definice problému klasifikace dokumentů, Bayesovský klasifikátor - detailní popis včetně pseudoalgoritmu, výsledků a možných vylepšení. Zajímavá strana je od 271 dál, popisuje metodu výběru příznaků, metriku "Mutual Information" udávající, jak konkrétní příznak přispívá ke správné klasifikaci, zase včetně ukázek. Dále popisuje různé statistiky jako příznakový vektor (například chí-kvadrát) a různé další metody použitelné pro klasifikaci. Následuje vyhodnocení jednotlivých přístupů a nakonec spousta literatury.
Bibtex:
% Textbook entry. The umlaut is written as the brace group {\"u} so that
% BibTeX treats it as a single letter when sorting and building labels.
@book{manning08,
    author = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
    day = {07},
    edition = {1},
    howpublished = {Hardcover},
    isbn = {0521865719},
    keywords = {information-retrieval},
    month = jul,
    posted-at = {2008-08-06 22:01:26},
    priority = {3},
    publisher = {Cambridge University Press},
    title = {Introduction to Information Retrieval},
    url = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0521865719},
    year = {2008}
}
 
File name: Feature selection on hierarchy of web documents.pdf
Description: Spousta užitečných vzorečků pro zjištění, jak hodně příznak ovlivňuje klasifikaci, včetně postupu, jak vybrat ty nejlepší, a srovnání výsledků. Podrobný popis problému klasifikace dokumentů.
Bibtex:
    
% Journal article.
% NOTE(review): the second author (Grobelnik) and the accent on the first
% author's surname were added from the publisher record for this DOI - confirm.
@article{citeulike:4983972,
    author = {Mladeni{\'c}, Dunja and Grobelnik, Marko},
    doi = {10.1016/S0167-9236(02)00097-0},
    issn = {01679236},
    journal = {Decision Support Systems},
    keywords = {datamining, ontology},
    month = apr,
    number = {1},
    pages = {45--87},
    posted-at = {2009-06-27 02:15:42},
    priority = {3},
    title = {Feature selection on hierarchy of web documents},
    url = {http://dx.doi.org/10.1016/S0167-9236(02)00097-0},
    volume = {35},
    year = {2003}
}


       
 
File name: Exploiting structural information for semi-structured document categorization.pdf
Description: Stručně Bayesovský klasifikátor a SVM klasifikátor. Uvádím ho jen kvůli SVM klasifikátoru a referencím na něj.
Bibtex:
 
% Journal article: the venue is a journal, so it is stored in the journal
% field of an article entry, with the ampersand escaped for LaTeX.
% NOTE(review): volume and number are not recorded here, and the year should
% be checked against the publisher record - TODO confirm.
@article{Bratko04exploitingstructural,
    author = {Bratko, Andrej and Filipi{\v{c}}, Bogdan},
    title = {Exploiting structural information for semi-structured document categorization},
    journal = {Information Processing \& Management},
    year = {2004},
    pages = {679--694}
}
       
 
File name: Multiple sets of features for automatic genre classification of web documents.pdf
Description: Využití syntaktických a lexikálních informací pro klasifikaci
Bibtex:
 
% Journal article. The title is left unprotected so the bibliography style
% controls its capitalisation.
@article{Lim05Genre,
    author = {Lim, C. and Lee, K. and Kim, G.},
    comment = {features include: URL features, HTML tags, token statistics (word, POS, symbols), selected term features,
    linguistically-motivated structural information. Target is web},
    doi = {10.1016/j.ipm.2004.06.004},
    issn = {03064573},
    journal = {Information Processing \& Management},
    keywords = {genre},
    month = sep,
    number = {5},
    pages = {1263--1276},
    posted-at = {2011-01-26 05:36:10},
    priority = {0},
    title = {Multiple sets of features for automatic genre classification of web documents},
    url = {http://dblab.mgt.ncu.edu.tw/\%E6\%95\%99\%E6\%9D\%90/2005\%20DM/57.pdf},
    volume = {41},
    year = {2005}
}


       
 
File name: Intelligent document classification.pdf
Description: Spíš postup obecné úlohy klasifikace dokumentů, dobrý návod jak začít. Na závěr srovnání klasifikátorů SVM, kNN, NNnet a NB (bayes)
Bibtex:
% Journal article. The title is left unprotected so the bibliography style
% controls its capitalisation.
% NOTE(review): this journal is usually cited as "Intelligent Data Analysis";
% the name below is kept as imported - confirm before publishing.
@article{Calvo2000,
    author = {Calvo, R. A. and Ceccatto, H. A.},
    citeulike-article-id = {1530595},
    journal = {Journal of Intelligent Data Analysis},
    keywords = {bibtex-import},
    number = {5},
    pages = {411--420},
    posted-at = {2007-08-02 14:05:22},
    priority = {0},
    title = {Intelligent document classification},
    volume = {4},
    year = {2000}
}


       
 
File name: Feature selection with dynamic mutual information.pdf
Description: Popis dynamické metody výběru příznaků.
Bibtex:
    
% Journal article; one field per line, in author / title / venue / locator order.
@article{Liu_Sun_Liu_Zhang_2009,
    author = {Liu, Huawen and Sun, Jigui and Liu, Lei and Zhang, Huijie},
    title = {Feature selection with dynamic mutual information},
    journal = {Pattern Recognition},
    volume = {42},
    number = {7},
    pages = {1330--1339},
    year = {2009},
    url = {http://linkinghub.elsevier.com/retrieve/pii/S0031320308004615}
}
       
 
File name: A two-stage feature selection method for text categorization.pdf
Description: Jiný popis výběru příznaků včetně srovnání jednotlivých přístupů.
Bibtex:
  
% Journal article. The bare DOI is taken from the resolver link in the
% ee field, which standard styles ignore.
@article{DBLP:journals/kbs/Uguz11,
  author    = {Uguz, Harun},
  title     = {A two-stage feature selection method for text categorization
               by using information gain, principal component analysis
               and genetic algorithm},
  journal   = {Knowledge-Based Systems},
  volume    = {24},
  number    = {7},
  year      = {2011},
  pages     = {1024--1032},
  doi       = {10.1016/j.knosys.2011.04.014},
  ee        = {http://dx.doi.org/10.1016/j.knosys.2011.04.014},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
       
 
File name: Automatic classification using supervised learning in a medical document filtering application.pdf
Description: Dobrý popis klasifikační úlohy včetně podrobného návodu.
Bibtex:
 
% Journal article. The journal name is spelled out in full, and the bare DOI
% is taken from the resolver link in the ee field, which standard styles ignore.
@article{journals/ipm/MostafaL00,
  author = {Mostafa, Javed and Lam, Wai},
  interhash = {70f81efd57e5c13aa6fbd401f27ea819},
  intrahash = {eae677fbb1abde3617534fe69e791cfb},
  journal = {Information Processing \& Management},
  number = 3,
  pages = {415--444},
  title = {Automatic classification using supervised learning in a medical document filtering application},
  url = {http://dblp.uni-trier.de/db/journals/ipm/ipm36.html#MostafaL00},
  volume = 36,
  year = 2000,
  doi = {10.1016/S0306-4573(99)00033-3},
  timestamp = {2011-07-08T00:00:00.000+0200},
  keywords = {dblp},
  ee = {http://dx.doi.org/10.1016/S0306-4573(99)00033-3},
  added-at = {2011-07-08T00:00:00.000+0200},
  biburl = {http://www.bibsonomy.org/bibtex/2eae677fbb1abde3617534fe69e791cfb/dblp}
}


       
 
 
File name: An Extensive Empirical Study of Feature Selection Metrics for Text Classification.pdf
Description: Různé metriky pro ověření správnosti vybraných příznaků a jejich porovnání.
Bibtex:
 
% Journal article.
% NOTE(review): the imported record also listed Isabelle Guyon and Andre
% Elisseeff as authors; they appear to be the special-issue editors, with
% George Forman as the sole author - confirm against the journal page.
@article{Forman03anextensive,
    author = {Forman, George},
    title = {An extensive empirical study of feature selection metrics for text classification},
    journal = {Journal of Machine Learning Research},
    year = {2003},
    volume = {3},
    pages = {1289--1305}
}
       
 
File name: Hierarchically SVM classification based on support vector clustering method and its application to document categorization.pdf
Description: Dobrý popis SVM klasifikátoru
Bibtex:
 
% Journal article. The issue is stored in the standard number field, the
% month uses the predefined macro, the DOI is stored without a resolver
% prefix, and the acronym in the title is brace-protected against recasing.
@article{Hao:2007:HSC:1230143.1230212,
 author = {Hao, Pei-Yi and Chiang, Jung-Hsien and Tu, Yi-Kun},
 title = {Hierarchically {SVM} classification based on support vector clustering method and its application to document categorization},
 journal = {Expert Systems with Applications},
 volume = {33},
 number = {3},
 month = oct,
 year = {2007},
 issn = {0957-4174},
 pages = {627--635},
 numpages = {9},
 url = {http://dx.doi.org/10.1016/j.eswa.2006.06.009},
 doi = {10.1016/j.eswa.2006.06.009},
 acmid = {1230212},
 publisher = {Pergamon Press, Inc.},
 address = {Tarrytown, NY, USA}
}
       
 
File name: Automatically computed document dependent weighting factor facility for Naïve Bayes classification.pdf
Description: Další metrika pro výpočet kvality vybraných příznaků pro bayesovský klasifikátor.
Bibtex:
      
% Journal article. Only the proper noun in the title is brace-protected; the
% accented letter is written as a brace group with a dotless i.
@article{citeulike:7278202,
    author = {Lee, Lam H. and Isa, Dino},
    day = {02},
    doi = {10.1016/j.eswa.2010.05.030},
    issn = {09574174},
    journal = {Expert Systems with Applications},
    keywords = {bayes, classification, dependent, document, naive, weighting},
    month = dec,
    number = {12},
    pages = {8471--8478},
    posted-at = {2011-02-27 07:40:42},
    priority = {3},
    title = {Automatically computed document dependent weighting factor facility for Na{\"\i}ve {Bayes} classification},
    url = {http://dx.doi.org/10.1016/j.eswa.2010.05.030},
    volume = {37},
    year = {2010}
}
       
 
File name: .pdf
Description:
Bibtex: