def tokenize(string):
    """Return only the word characters of `string`, joined into one string.

    Punctuation, whitespace and every other non-word character is dropped;
    letters, digits, underscores and CJK characters (everything matched by
    the Unicode-aware ``\\w``) are kept in their original order.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving characters concatenated without separators
        (``''`` for empty input).
    """
    # The corpus stores line breaks as the literal two-character text "\n"
    # (a backslash followed by the letter n), not as real newlines. `\w+`
    # would drop the backslash but keep the "n", polluting the character
    # statistics with a bogus, very frequent 'n'. Remove the marker first.
    # NOTE(review): this also strips the "n" of any genuine backslash-n text
    # in the corpus; confirm that is acceptable for this data set.
    without_newline_markers = string.replace('\\n', '')

    # Raw string so the regex escape is not parsed as a (deprecated) Python
    # string escape. The earlier variant '[\w|\d]+' gave different results
    # because inside a character class `|` is a literal pipe, not alternation,
    # so pipe characters were kept as well; `\d` is redundant there since
    # `\w` already matches digits.
    return ''.join(re.findall(r'\w+', without_newline_markers))
# Spot-check the tokenizer on the first 200 characters of the corpus.
tokenize(all_content[:200])

# Run the tokenizer over the whole corpus and keep the result as one long
# string; the cell then displays how many characters survived.
ALL_CHARACTER = tokenize(all_content)
len(ALL_CHARACTER)

from collections import Counter

# Iterating a str yields single characters, so this counts how often each
# individual character occurs in ALL_CHARACTER (the unigram counts).
all_character_counts = Counter(ALL_CHARACTER)
112639),\n", + " ('业', 112051),\n", + " ('不', 112042),\n", + " ('会', 109898),\n", + " ('代', 100566),\n", + " ('地', 96026),\n", + " ('球', 92521),\n", + " ('时', 92322),\n", + " ('3', 92131),\n", + " ('者', 91667),\n", + " ('作', 91293),\n", + " ('以', 91191),\n", + " ('家', 89815),\n", + " ('成', 89485),\n", + " ('对', 87199),\n", + " ('6', 86963),\n", + " ('市', 85776),\n", + " ('来', 85467),\n", + " ('出', 85263),\n", + " ('生', 84895),\n", + " ('个', 83544),\n", + " ('这', 81872),\n", + " ('公', 81351),\n", + " ('比', 80433),\n", + " ('动', 80430),\n", + " ('线', 80196),\n", + " ('全', 79296),\n", + " ('体', 79271),\n", + " ('二', 79008),\n", + " ('进', 78995),\n", + " ('开', 78979),\n", + " ('1', 78756),\n", + " ('0', 75949),\n", + " ('多', 75399),\n", + " ('学', 73616),\n", + " ('队', 73414),\n", + " ('到', 73247),\n", + " ('斯', 72799),\n", + " ('法', 72555),\n", + " ('合', 72090),\n", + " ('展', 71993),\n", + " ('要', 71584),\n", + " ('场', 71047),\n", + " ('方', 70982),\n", + " ('部', 70644),\n", + " ('记', 69959),\n", + " ('工', 69432),\n", + " ('经', 67717),\n", + " ('分', 66855),\n", + " ('能', 65109),\n", + " ('区', 64966),\n", + " ('前', 64792),\n", + " ('于', 64751),\n", + " ('民', 64686),\n", + " ('高', 64161),\n", + " ('当', 64148),\n", + " ('美', 64016),\n", + " ('将', 63416),\n", + " ('2', 62896),\n", + " ('与', 62647),\n", + " ('员', 62067),\n", + " ('加', 60772),\n", + " ('后', 60761),\n", + " ('片', 60694),\n", + " ('现', 60591),\n", + " ('利', 60337),\n", + " ('主', 60068),\n", + " ('产', 58963),\n", + " ('等', 58799),\n", + " ('机', 58508),\n", + " ('联', 58457),\n", + " ('特', 57946),\n", + " ('长', 57577),\n", + " ('化', 57341),\n", + " ('电', 57128),\n", + " ('用', 56850),\n", + " ('尔', 56778),\n", + " ('自', 56418),\n", + " ('建', 56356),\n", + " ('照', 55866),\n", + " ('下', 55840),\n", + " ('实', 55554),\n", + " ('过', 55069),\n", + " ('海', 54110),\n", + " ('路', 54097),\n", + " ('力', 53780),\n", + " ('北', 53433),\n", + " ('他', 53304),\n", + " ('第', 52636),\n", + " ('政', 52538),\n", + " ('西', 52510),\n", + 
" ('关', 52252),\n", + " ('文', 51815),\n", + " ('重', 51789),\n", + " ('同', 51457),\n", + " ('说', 50644),\n", + " ('里', 49866),\n", + " ('通', 49858),\n", + " ('内', 49753),\n", + " ('资', 49587),\n", + " ('8', 49314),\n", + " ('表', 49250),\n", + " ('9', 49168),\n", + " ('理', 49021),\n", + " ('我', 48966),\n", + " ('们', 48171),\n", + " ('本', 47561),\n", + " ('平', 47322),\n", + " ('天', 47322),\n", + " ('务', 46919),\n", + " ('金', 46747),\n", + " ('手', 46697),\n", + " ('亚', 46635),\n", + " ('目', 46335),\n", + " ('小', 46201),\n", + " ('面', 46037),\n", + " ('安', 45462),\n", + " ('子', 45203),\n", + " ('事', 44971),\n", + " ('名', 44969),\n", + " ('得', 44538),\n", + " ('网', 44437),\n", + " ('可', 44367),\n", + " ('际', 44319),\n", + " ('也', 43926),\n", + " ('设', 43772),\n", + " ('之', 43382),\n", + " ('选', 43013),\n", + " ('摄', 42876),\n", + " ('制', 42556),\n", + " ('科', 42495),\n", + " ('度', 42073),\n", + " ('品', 41941),\n", + " ('次', 41782),\n", + " ('马', 41496),\n", + " ('定', 41222),\n", + " ('入', 41218),\n", + " ('提', 41007),\n", + " ('德', 40949),\n", + " ('总', 40914),\n", + " ('最', 40389),\n", + " ('育', 40317),\n", + " ('车', 40197),\n", + " ('举', 39934),\n", + " ('就', 39834),\n", + " ('元', 39741),\n", + " ('示', 39711),\n", + " ('交', 39632),\n", + " ('期', 39215),\n", + " ('保', 39078),\n", + " ('心', 38963),\n", + " ('战', 38851),\n", + " ('京', 38809),\n", + " ('其', 38756),\n", + " ('级', 38659),\n", + " ('基', 37846),\n", + " ('拉', 37294),\n", + " ('南', 37176),\n", + " ('从', 36985),\n", + " ('报', 36816),\n", + " ('点', 36691),\n", + " ('间', 36577),\n", + " ('5', 36511),\n", + " ('都', 36440),\n", + " ('东', 36180),\n", + " ('水', 35795),\n", + " ('数', 35509),\n", + " ('三', 35431),\n", + " ('带', 35418),\n", + " ('首', 35272),\n", + " ('道', 35174),\n", + " ('两', 35000),\n", + " ('明', 34903),\n", + " ('已', 34892),\n", + " ('强', 34866),\n", + " ('统', 34786),\n", + " ('据', 34750),\n", + " ('台', 34330),\n", + " ('及', 34105),\n", + " ('布', 33948),\n", + " ('克', 33941),\n", + " ('计', 33907),\n", 
+ " ('好', 33641),\n", + " ('城', 33555),\n", + " ('相', 33482),\n", + " ('共', 33316),\n", + " ('万', 33204),\n", + " ('巴', 33195),\n", + " ('3', 32998),\n", + " ('近', 32909),\n", + " ('题', 32907),\n", + " ('完', 32733),\n", + " ('系', 32690),\n", + " ('技', 32676),\n", + " ('世', 32568),\n", + " ('军', 32536),\n", + " ('足', 32383),\n", + " ('物', 32362),\n", + " ('位', 32256),\n", + " ('省', 32180),\n", + " ('决', 32050),\n", + " ('还', 32004),\n", + " ('司', 31879),\n", + " ('项', 31854),\n", + " ('企', 31649),\n", + " ('持', 31644),\n", + " ('意', 31635),\n", + " ('院', 31594),\n", + " ('活', 31577),\n", + " ('式', 31554),\n", + " ('创', 31538),\n", + " ('山', 31101),\n", + " ('组', 31088),\n", + " ('而', 30789),\n", + " ('4', 30775),\n", + " ('并', 30737),\n", + " ('正', 30677),\n", + " ('罗', 30349),\n", + " ('胜', 30287),\n", + " ('量', 30214),\n", + " ('性', 30081),\n", + " ('运', 30080),\n", + " ('此', 29974),\n", + " ('欧', 29798),\n", + " ('起', 29662),\n", + " ('州', 29622),\n", + " ('管', 29514),\n", + " ('更', 29438),\n", + " ('信', 29428),\n", + " ('着', 29308),\n", + " ('达', 29094),\n", + " ('影', 28990),\n", + " ('增', 28861),\n", + " ('术', 28848),\n", + " ('所', 28706),\n", + " ('至', 28630),\n", + " ('标', 28371),\n", + " ('推', 28101),\n", + " ('局', 27813),\n", + " ('广', 27800),\n", + " ('规', 27790),\n", + " ('向', 27788),\n", + " ('门', 27550),\n", + " ('济', 27487),\n", + " ('参', 27433),\n", + " ('节', 27414),\n", + " ('立', 27286),\n", + " ('6', 27259),\n", + " ('任', 27244),\n", + " ('受', 26933),\n", + " ('收', 26795),\n", + " ('投', 26778),\n", + " ('造', 26668),\n", + " ('服', 26660),\n", + " ('应', 26553),\n", + " ('商', 26546),\n", + " ('被', 26528),\n", + " ('格', 26437),\n", + " ('今', 26434),\n", + " ('界', 26408),\n", + " ('游', 26203),\n", + " ('议', 26120),\n", + " ('程', 25956),\n", + " ('接', 25880),\n", + " ('改', 25851),\n", + " ('专', 25816),\n", + " ('研', 25749),\n", + " ('教', 25693),\n", + " ('种', 25654),\n", + " ('调', 25616),\n", + " ('各', 25494),\n", + " ('装', 25332),\n", + " ('但', 
25253),\n", + " ('如', 25127),\n", + " ('问', 24988),\n", + " ('件', 24561),\n", + " ('领', 24550),\n", + " ('单', 24509),\n", + " ('村', 24411),\n", + " ('办', 24316),\n", + " ('约', 24283),\n", + " ('无', 24199),\n", + " ('情', 23988),\n", + " ('江', 23945),\n", + " ('英', 23881),\n", + " ('超', 23861),\n", + " ('回', 23848),\n", + " ('价', 23835),\n", + " ('因', 23746),\n", + " ('打', 23537),\n", + " ('导', 23509),\n", + " ('7', 23455),\n", + " ('纳', 23389),\n", + " ('解', 23324),\n", + " ('由', 22875),\n", + " ('指', 22836),\n", + " ('处', 22769),\n", + " ('看', 22754),\n", + " ('去', 22665),\n", + " ('兰', 22624),\n", + " ('港', 22593),\n", + " ('周', 22566),\n", + " ('传', 22363),\n", + " ('季', 22280),\n", + " ('治', 22256),\n", + " ('流', 21957),\n", + " ('士', 21900),\n", + " ('环', 21874),\n", + " ('图', 21869),\n", + " ('支', 21807),\n", + " ('女', 21799),\n", + " ('团', 21758),\n", + " ('阿', 21690),\n", + " ('集', 21657),\n", + " ('些', 21565),\n", + " ('查', 21505),\n", + " ('施', 21486),\n", + " ('空', 21486),\n", + " ('常', 21421),\n", + " ('客', 21191),\n", + " ('然', 21137),\n", + " ('结', 21055),\n", + " ('农', 21023),\n", + " ('果', 21022),\n", + " ('深', 21015),\n", + " ('委', 20972),\n", + " ('四', 20893),\n", + " ('放', 20877),\n", + " ('续', 20865),\n", + " ('府', 20747),\n", + " ('园', 20570),\n", + " ('尼', 20483),\n", + " ('步', 20406),\n", + " ('普', 20395),\n", + " ('口', 20387),\n", + " ('房', 20374),\n", + " ('张', 20252),\n", + " ('认', 20252),\n", + " ('获', 20231),\n", + " ('医', 20167),\n", + " ('原', 19897),\n", + " ('风', 19890),\n", + " ('林', 19668),\n", + " ('9', 19539),\n", + " ('供', 19451),\n", + " ('米', 19444),\n", + " ('维', 19156),\n", + " ('河', 19093),\n", + " ('易', 19051),\n", + " ('让', 18991),\n", + " ('求', 18937),\n", + " ('老', 18832),\n", + " ('众', 18784),\n", + " ('党', 18783),\n", + " ('户', 18696),\n", + " ('股', 18692),\n", + " ('冠', 18687),\n", + " ('书', 18537),\n", + " ('少', 18531),\n", + " ('轮', 18521),\n", + " ('案', 18512),\n", + " ('身', 18484),\n", + " ('源', 18452),\n", + " 
('8', 18415),\n", + " ('十', 18294),\n", + " ('境', 18276),\n", + " ('息', 18150),\n", + " ('航', 18139),\n", + " ('头', 18133),\n", + " ('演', 18127),\n", + " ('使', 18070),\n", + " ('很', 18023),\n", + " ('没', 17924),\n", + " ('观', 17900),\n", + " ('取', 17865),\n", + " ('花', 17858),\n", + " ('未', 17845),\n", + " ('协', 17760),\n", + " ('告', 17706),\n", + " ('证', 17683),\n", + " ('视', 17650),\n", + " ('龙', 17567),\n", + " ('非', 17559),\n", + " ('王', 17558),\n", + " ('引', 17541),\n", + " ('升', 17446),\n", + " ('亿', 17408),\n", + " ('样', 17403),\n", + " ('每', 17342),\n", + " ('融', 17341),\n", + " ('需', 17287),\n", + " ('费', 17284),\n", + " ('究', 17201),\n", + " ('营', 17178),\n", + " ('构', 17159),\n", + " ('准', 17056),\n", + " ('校', 16909),\n", + " ('别', 16887),\n", + " ('卡', 16854),\n", + " ('先', 16823),\n", + " ('师', 16757),\n", + " ('县', 16721),\n", + " ('李', 16670),\n", + " ('考', 16648),\n", + " ('助', 16592),\n", + " ('站', 16555),\n", + " ('域', 16528),\n", + " ('气', 16501),\n", + " ('色', 16442),\n", + " ('预', 16440),\n", + " ('变', 16392),\n", + " ('该', 16343),\n", + " ('权', 16305),\n", + " ('显', 16304),\n", + " ('型', 16299),\n", + " ('备', 16240),\n", + " ('护', 16218),\n", + " ('转', 16188),\n", + " ('列', 16133),\n", + " ('只', 16104),\n", + " ('走', 16046),\n", + " ('击', 15931),\n", + " ('模', 15924),\n", + " ('责', 15872),\n", + " ('整', 15811),\n", + " ('做', 15790),\n", + " ('给', 15697),\n", + " ('青', 15670),\n", + " ('快', 15616),\n", + " ('A', 15615),\n", + " ('双', 15611),\n", + " ('号', 15539),\n", + " ('俄', 15474),\n", + " ('苏', 15413),\n", + " ('届', 15399),\n", + " ('直', 15396),\n", + " ('势', 15339),\n", + " ('包', 15301),\n", + " ('习', 15237),\n", + " ('划', 15188),\n", + " ('质', 15162),\n", + " ('称', 15095),\n", + " ('媒', 15060),\n", + " ('互', 15052),\n", + " ('乐', 15025),\n", + " ('极', 14918),\n", + " ('消', 14908),\n", + " ('率', 14899),\n", + " ('监', 14878),\n", + " ('香', 14868),\n", + " ('条', 14847),\n", + " ('态', 14767),\n", + " ('类', 14673),\n", + " ('越', 14658),\n", + 
" ('论', 14615),\n", + " ('晋', 14570),\n", + " ('光', 14554),\n", + " ('古', 14543),\n", + " ('博', 14521),\n", + " ('警', 14482),\n", + " ('伊', 14482),\n", + " ('优', 14459),\n", + " ('飞', 14416),\n", + " ('积', 14333),\n", + " ('清', 14329),\n", + " ('验', 14319),\n", + " ('铁', 14290),\n", + " ('织', 14262),\n", + " ('防', 14248),\n", + " ('难', 14150),\n", + " ('儿', 14147),\n", + " ('武', 14115),\n", + " ('效', 14088),\n", + " ('形', 14060),\n", + " ('闻', 14004),\n", + " ('落', 13881),\n", + " ('确', 13809),\n", + " ('速', 13805),\n", + " ('洲', 13745),\n", + " ('男', 13723),\n", + " ('银', 13703),\n", + " ('反', 13672),\n", + " ('夫', 13495),\n", + " ('五', 13469),\n", + " ('始', 13446),\n", + " ('想', 13425),\n", + " ('策', 13424),\n", + " ('旅', 13406),\n", + " ('奥', 13404),\n", + " ('贫', 13363),\n", + " ('土', 13332),\n", + " ('试', 13326),\n", + " ('精', 13323),\n", + " ('或', 13316),\n", + " ('具', 13281),\n", + " ('席', 13270),\n", + " ('知', 13176),\n", + " ('感', 13175),\n", + " ('卫', 13154),\n", + " ('见', 13129),\n", + " ('较', 13117),\n", + " ('采', 13063),\n", + " ('访', 13042),\n", + " ('庆', 12920),\n", + " ('己', 12916),\n", + " ('白', 12848),\n", + " ('热', 12717),\n", + " ('群', 12707),\n", + " ('段', 12686),\n", + " ('艺', 12668),\n", + " ('革', 12646),\n", + " ('连', 12646),\n", + " ('景', 12616),\n", + " ('再', 12615),\n", + " ('历', 12595),\n", + " ('限', 12555),\n", + " ('才', 12525),\n", + " ('班', 12522),\n", + " ('话', 12514),\n", + " ('把', 12489),\n", + " ('望', 12446),\n", + " ('希', 12412),\n", + " ('根', 12378),\n", + " ('黄', 12365),\n", + " ('份', 12351),\n", + " ('范', 12335),\n", + " ('器', 12320),\n", + " ('边', 12264),\n", + " ('职', 12249),\n", + " ('她', 12240),\n", + " ('福', 12172),\n", + " ('星', 12124),\n", + " ('半', 12118),\n", + " ('注', 12104),\n", + " ('致', 12100),\n", + " ('干', 12035),\n", + " ('益', 11935),\n", + " ('复', 11891),\n", + " ('阳', 11860),\n", + " ('况', 11833),\n", + " ('朗', 11769),\n", + " ('友', 11755),\n", + " ('低', 11737),\n", + " ('湖', 11668),\n", + " ('馆', 11653),\n", 
+ " ('纪', 11645),\n", + " ('负', 11636),\n", + " ('义', 11562),\n", + " ('火', 11537),\n", + " ('牌', 11521),\n", + " ('那', 11511),\n", + " ('练', 11494),\n", + " ('何', 11463),\n", + " ('功', 11405),\n", + " ('检', 11405),\n", + " ('波', 11401),\n", + " ('随', 11383),\n", + " ('值', 11350),\n", + " ('险', 11338),\n", + " ('批', 11328),\n", + " ('智', 11325),\n", + " ('降', 11297),\n", + " ('响', 11265),\n", + " ('介', 11239),\n", + " ('健', 11207),\n", + " ('韩', 11187),\n", + " ('排', 11132),\n", + " ('住', 11094),\n", + " ('容', 11084),\n", + " ('断', 11071),\n", + " ('塞', 10974),\n", + " ('养', 10963),\n", + " ('百', 10927),\n", + " ('仅', 10889),\n", + " ('存', 10861),\n", + " ('贸', 10859),\n", + " ('沙', 10794),\n", + " ('争', 10792),\n", + " ('则', 10792),\n", + " ('石', 10780),\n", + " ('春', 10766),\n", + " ('控', 10730),\n", + " ('字', 10709),\n", + " ('爱', 10694),\n", + " ('严', 10623),\n", + " ('承', 10622),\n", + " ('e', 10570),\n", + " ('镇', 10553),\n", + " ('评', 10538),\n", + " ('几', 10503),\n", + " ('核', 10454),\n", + " ('油', 10410),\n", + " ('陈', 10407),\n", + " ('曼', 10359),\n", + " ('史', 10349),\n", + " ('塔', 10333),\n", + " ('汉', 10315),\n", + " ('红', 10309),\n", + " ('购', 10299),\n", + " ('往', 10182),\n", + " ('识', 10152),\n", + " ('切', 10130),\n", + " ('余', 10077),\n", + " ('察', 10051),\n", + " ('货', 10042),\n", + " ('太', 10019),\n", + " ('宣', 10013),\n", + " ('么', 9988),\n", + " ('播', 9944),\n", + " ('坚', 9939),\n", + " ('C', 9927),\n", + " ('印', 9927),\n", + " ('宁', 9901),\n", + " ('财', 9895),\n", + " ('央', 9877),\n", + " ('哈', 9872),\n", + " ('象', 9872),\n", + " ('奇', 9865),\n", + " ('终', 9850),\n", + " ('病', 9832),\n", + " ('透', 9812),\n", + " ('a', 9811),\n", + " ('刘', 9798),\n", + " ('官', 9797),\n", + " ('右', 9785),\n", + " ('左', 9784),\n", + " ('副', 9738),\n", + " ('款', 9719),\n", + " ('围', 9711),\n", + " ('富', 9666),\n", + " ('真', 9639),\n", + " ('居', 9637),\n", + " ('依', 9610),\n", + " ('食', 9609),\n", + " ('训', 9596),\n", + " ('志', 9559),\n", + " ('远', 9557),\n", + " 
('继', 9541),\n", + " ('神', 9458),\n", + " ('奖', 9442),\n", + " ('均', 9383),\n", + " ('言', 9374),\n", + " ('涨', 9372),\n", + " ('哥', 9334),\n", + " ('执', 9330),\n", + " ('许', 9312),\n", + " ('突', 9299),\n", + " ('兴', 9291),\n", + " ('幕', 9287),\n", + " ('甲', 9269),\n", + " ('币', 9251),\n", + " ('底', 9243),\n", + " ('票', 9219),\n", + " ('破', 9211),\n", + " ('声', 9190),\n", + " ('销', 9168),\n", + " ('层', 9165),\n", + " ('锦', 9159),\n", + " ('药', 9139),\n", + " ('澳', 9128),\n", + " ('B', 9125),\n", + " ('孩', 9123),\n", + " ('黎', 9114),\n", + " ('族', 9069),\n", + " ('竞', 9046),\n", + " ('秀', 9025),\n", + " ('乡', 9004),\n", + " ('满', 9000),\n", + " ('岁', 8975),\n", + " ('离', 8965),\n", + " ('拍', 8942),\n", + " ('除', 8912),\n", + " ('减', 8904),\n", + " ('测', 8849),\n", + " ('善', 8838),\n", + " ('雷', 8810),\n", + " ('即', 8792),\n", + " ('萨', 8777),\n", + " ('索', 8739),\n", + " ('康', 8720),\n", + " ('峰', 8710),\n", + " ('I', 8692),\n", + " ('o', 8684),\n", + " ('绍', 8670),\n", + " ('盟', 8670),\n", + " ('洛', 8654),\n", + " ('篮', 8632),\n", + " ('困', 8618),\n", + " ('湾', 8612),\n", + " ('售', 8603),\n", + " ('鲁', 8589),\n", + " ('端', 8562),\n", + " ('律', 8553),\n", + " ('料', 8542),\n", + " ('候', 8525),\n", + " ('稿', 8516),\n", + " ('库', 8513),\n", + " ('瓦', 8494),\n", + " ('置', 8490),\n", + " ('勒', 8449),\n", + " ('托', 8441),\n", + " ('幅', 8432),\n", + " ('P', 8422),\n", + " ('培', 8368),\n", + " ('云', 8348),\n", + " ('审', 8341),\n", + " ('享', 8333),\n", + " ('什', 8326),\n", + " ('亲', 8309),\n", + " ('埃', 8304),\n", + " ('录', 8288),\n", + " ('失', 8280),\n", + " ('陆', 8207),\n", + " ('迎', 8206),\n", + " ('故', 8203),\n", + " ('又', 8182),\n", + " ('诺', 8176),\n", + " ('蒙', 8172),\n", + " ('曾', 8145),\n", + " ('森', 8133),\n", + " ('命', 8122),\n", + " ('稳', 8113),\n", + " ('威', 8102),\n", + " ('迪', 8090),\n", + " ('便', 8089),\n", + " ('配', 8082),\n", + " ('括', 8073),\n", + " ('伦', 8072),\n", + " ('冰', 8067),\n", + " ('念', 8061),\n", + " ('编', 8035),\n", + " ('i', 8017),\n", + " 
('略', 7998),\n", + " ('黑', 7957),\n", + " ('留', 7929),\n", + " ('微', 7928),\n", + " ('坦', 7911),\n", + " ('疗', 7908),\n", + " ('愿', 7904),\n", + " ('谈', 7866),\n", + " ('修', 7843),\n", + " ('请', 7829),\n", + " ('担', 7823),\n", + " ('初', 7820),\n", + " ('额', 7815),\n", + " ('午', 7796),\n", + " ('夺', 7793),\n", + " ('牙', 7711),\n", + " ('救', 7694),\n", + " ('促', 7685),\n", + " ('启', 7660),\n", + " ('础', 7602),\n", + " ('例', 7601),\n", + " ('密', 7565),\n", + " ('岛', 7524),\n", + " ('临', 7521),\n", + " ('算', 7479),\n", + " ('欢', 7476),\n", + " ('岸', 7470),\n", + " ('按', 7449),\n", + " ('伤', 7442),\n", + " ('早', 7439),\n", + " ('占', 7428),\n", + " ('船', 7421),\n", + " ('遇', 7398),\n", + " ('吉', 7397),\n", + " ('思', 7371),\n", + " ('音', 7370),\n", + " ('鲜', 7362),\n", + " ('汽', 7354),\n", + " ('津', 7346),\n", + " ('登', 7330),\n", + " ('祝', 7329),\n", + " ('剧', 7325),\n", + " ('你', 7323),\n", + " ('姆', 7299),\n", + " ('紧', 7288),\n", + " ('讯', 7283),\n", + " ('补', 7257),\n", + " ('板', 7255),\n", + " ('死', 7248),\n", + " ('室', 7243),\n", + " ('络', 7213),\n", + " ('某', 7209),\n", + " ('读', 7207),\n", + " ('雨', 7092),\n", + " ('探', 7089),\n", + " ('诉', 7076),\n", + " ('讲', 7072),\n", + " ('脱', 7061),\n", + " ('税', 7061),\n", + " ('停', 7043),\n", + " ('涉', 7026),\n", + " ('杯', 7000),\n", + " ('仍', 6985),\n", + " ('摩', 6983),\n", + " ('违', 6952),\n", + " ('像', 6928),\n", + " ('必', 6910),\n", + " ('N', 6890),\n", + " ('朝', 6873),\n", + " ('温', 6862),\n", + " ('签', 6859),\n", + " ('障', 6834),\n", + " ('帮', 6833),\n", + " ('田', 6829),\n", + " ('松', 6827),\n", + " ('丝', 6813),\n", + " ('拿', 6801),\n", + " ('属', 6753),\n", + " ('移', 6745),\n", + " ('买', 6728),\n", + " ('洋', 6715),\n", + " ('千', 6708),\n", + " ('画', 6689),\n", + " ('宝', 6676),\n", + " ('角', 6648),\n", + " ('乌', 6625),\n", + " ('债', 6621),\n", + " ('轻', 6617),\n", + " ('八', 6597),\n", + " ('亮', 6586),\n", + " ('送', 6571),\n", + " ('店', 6563),\n", + " ('泰', 6550),\n", + " ('瑞', 6525),\n", + " ('舞', 6520),\n", + " 
('跌', 6503),\n", + " ('绿', 6489),\n", + " ('判', 6488),\n", + " ('素', 6438),\n", + " ('申', 6422),\n", + " ('川', 6415),\n", + " ('待', 6404),\n", + " ('t', 6401),\n", + " ('晚', 6393),\n", + " ('夏', 6387),\n", + " ('攻', 6362),\n", + " ('坛', 6350),\n", + " ('r', 6345),\n", + " ('盘', 6344),\n", + " ('独', 6343),\n", + " ('吸', 6343),\n", + " ('贵', 6339),\n", + " ('逐', 6338),\n", + " ('童', 6322),\n", + " ('杨', 6318),\n", + " ('袭', 6305),\n", + " ('充', 6271),\n", + " ('载', 6228),\n", + " ('厂', 6193),\n", + " ('语', 6187),\n", + " ('假', 6146),\n", + " ('莫', 6138),\n", + " ('扶', 6136),\n", + " ('座', 6134),\n", + " ('挥', 6131),\n", + " ('压', 6105),\n", + " ('丽', 6083),\n", + " ('典', 6072),\n", + " ('综', 6062),\n", + " ('另', 6061),\n", + " ('丹', 6053),\n", + " ('眼', 6046),\n", + " ('六', 6046),\n", + " ('敦', 6041),\n", + " ('梅', 6038),\n", + " ('盛', 6033),\n", + " ('短', 6000),\n", + " ('木', 5991),\n", + " ('滑', 5989),\n", + " ('射', 5963),\n", + " ('刚', 5949),\n", + " ('毕', 5929),\n", + " ('莱', 5916),\n", + " ('母', 5902),\n", + " ('守', 5872),\n", + " ('庭', 5862),\n", + " ('付', 5861),\n", + " ('跑', 5854),\n", + " ('良', 5848),\n", + " ('且', 5846),\n", + " ('菲', 5834),\n", + " ('乒', 5831),\n", + " ('尽', 5796),\n", + " ('街', 5782),\n", + " ('歌', 5753),\n", + " ('纽', 5752),\n", + " ('督', 5732),\n", + " ('桥', 5729),\n", + " ('它', 5698),\n", + " ('措', 5692),\n", + " ('冲', 5692),\n", + " ('止', 5678),\n", + " ('署', 5673),\n", + " ('简', 5655),\n", + " ('令', 5650),\n", + " ('植', 5647),\n", + " ('够', 5631),\n", + " ('券', 5607),\n", + " ('雪', 5598),\n", + " ('丰', 5594),\n", + " ('映', 5591),\n", + " ('驻', 5564),\n", + " ('顿', 5542),\n", + " ('S', 5536),\n", + " ('疆', 5534),\n", + " ('细', 5528),\n", + " ('沿', 5527),\n", + " ('绩', 5520),\n", + " ('弹', 5520),\n", + " ('派', 5516),\n", + " ('罪', 5514),\n", + " ('毛', 5510),\n", + " ('牛', 5507),\n", + " ('状', 5501),\n", + " ('免', 5492),\n", + " ('害', 5492),\n", + " ('彩', 5472),\n", + " ('输', 5448),\n", + " ('退', 5444),\n", + " ('斗', 5434),\n", + " 
# Show the 100 most frequent characters with their counts.
# Passing n to most_common() selects the top entries directly instead of
# sorting every distinct character and slicing afterwards; the returned list
# is the same as most_common()[0:100].
# NOTE(review): the output stored with this cell lists far more than 100
# entries (it ends in "..."), so it appears to predate the slice; re-run the
# cell so the saved output matches the code.
all_character_counts.most_common(100)
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAEMCAYAAADK231MAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xl8VNX5+PHPkz0kEPYtC/u+74obFhVQEbSi4IpS0fbrVqut7betttVv219bt9YNFREVcK9ocasUcUEh7PsqIWFLQsi+J+f3x7nRIU3CJDOZLc/79cormTt37n3mZOaZM+ece44YY1BKKRW6wvwdgFJKqealiV4ppUKcJnqllApxmuiVUirEaaJXSqkQp4leKaVCnCb6FkREuojIahEpEJG/NdM5DorIBc1xbBX4RMSISF8fnGeuiHzR3Oep59zfPUcReUZEfuOPOBojJBK9k1xKRKTQ5ae7v+MKQPOBbKCNMeZnnh5MRBaJyEOeh9W8fJV8VOAQkQUisltEqkVkbj37fCwiF3lyHmPMbcaYP3hyDF8IiUTvmG6MiXf5OVJ7BxGJ8EdgAaQHsMPoVXLKQ0HwXtoM/ATYUNedIhIHjAE+82VQfmOMCfof4CBwQR3bewIGmAccAlY7288AvgJysS+ISS6P6YX95xcAnwD/AF5x7psEZNR3buwH5/3AfuAE8DrQvlYsNzqxZAP/63KccOBXzmMLgPVAMvAk8Lda53wPuLuespgIrAPynN8Tne2LgAqgHCisp7wWOef7lxPDN0Cfes4zv9bx3nMpj3uBLU4MrwEx9RxjLvAl8KjzvzjgxD8XSAcygRtd9k8AFgNZQBrwayDMua+v83/Lc8r2NWf7aqfci5w4r64nlluAnc7z3gGMdrYPAlY58W0HLqtVXk8BHzjH/hLoCjwGnAR2AaNqvVbuc8qmCHgB6OI8vgD4N9DOZf/LnHPmOjEMqnUsd8s5DRjj/H2dUx6Dnds/Av7p/B3txH7E+XkMiHZ97QO/AI4BLzvb7wOOOvvf7By7bz1x3ORSxgeAW13uqzn+z5z/+1HgJpf7OwDLgXxgLfAH4As3csMXwNw6tl8GLHf+fhD7Xl3sxLYdGNvAMb97js5r4CE3n0M08Ffs+/848AwQ65Mc6YuTNPuTOH2iXwzEAbFAIjYJX4xNzBc6tzs5j1kDPOL8U851/vHuJvq7ga+BJOfxzwJLa8XynBPHCKAM583rvGG2AgMAce7vAIx33kQ1Ca0jUAx0qeP5tscmmOuBCGCOc7tD7RdlPeW4CMhxzhkBvAosO83+D9VRHmuB7k48O4Hb6nn8XKASmwDCgYecN8GTTvld5JR/vLP/YuBdoLVTnnuAec59S4H/df6nMcDZdb0x64ljFnAYGOeUfV/st59IYB/2AzgK+IETzwCX55+NrRnGACuBb4EbXJ7Pf2qVzdfY5J6ITQYbgFHO810JPODs2x/7YXChE8fPnViimlDOi4GfOX8vwFYmfuxy30+dv3/vxNcZ6IStDP3B5bVfCfzZiTUWmIpNWEOx768lDZU1cAnQxynj87Cv49G1jv975/le7Nzfzrl/GTYZxznnO4xnif4ZnA8abKIvdc4ZDvwR+LqBYzaU6Bt6Do9hP6zaY1/D7wF/9EmO9MVJmv1J2Bd9Ibbmk8v3NZSezj+lt8u+v8Cpjbhs+whb005x/lFxLvctwf1EvxOY7HJfN2ytN8IlliSX+9cCs52/dwMz6nl+O4ELnb9vB1bUs9/1wNpa29bUvNBxL9E/73L7YmDXafavK9Ff53L7/wHP1PP4ucBel9vDnDLq4rLtBDDSeQOW4dREnftuBVY5fy/GJrGkOs5zukT/EXBXHdvPwdZew1y2LQUedHn+z7ncdwews9bzya1VNte63H4LeLrW42teu78BXne5Lwyb3CY1oZzn8X3tdSe2Fr/MuZ3G98l2P3Cxy+OmAAddXvvluHxrABYCf3K53f9
0ZV0rrn/WlLtz/BIgwuX+TOy373Ds+2igy33/h2eJPg1Idv5+EPi3y32DgZIGjtlQoq/vOQj2g7uPy31nAt+6U1ae/oRSG/1MY0xb52dmrfvSXf7uAcwSkdyaH+BsbFLuDpw0xhS57J/WiBh6AO+4HHcnUIWtwdU45vJ3MRDv/J2MfaPV5SXsV26c3y/Xs1/3OuJNw9Ye3VVnfCLyK5eO7meacox6HHf5uwTAGFN7Wzz2m0wUpz4/1+f2c+ybaa2IbBeRm08To6v6yr47kG6Mqa7nnHXFX1fsrtzd/5T/pRNDeq1zu1vOnwHniEhXbNJ8DThLRHpim8M21XVO52/XQQ1ZxphSl9vdOfW91eB7RUSmicjXIpLjvD8uxv5fa5wwxlTW8Zw6YStLbp/rNHEMA/KNMa7Hq12WMU3sh2joObQC1rvkhw+d7c0ulBJ9Q4zL3+nYGn1bl584Y8yfsG1q7ZyOmhopLn8XYf9ZAIhIOKf+o9KBabWOHWOMOexGjOnYr7V1eQWYISIjsG3G/6xnvyPYDxtXKdiaoEeMMf9nvu/ovq1ms6fHbYRsbK3O9fl999yMMceMMbcYY7pja/pPNWKkTX1lfwRIFhHX94lXytMNp/wvRUSwH0iNPrcxZh824dyJ7acqwCa2+dhacc0HWe3XT4qz7btD1Tr0UScm1/3rJCLR2G8wf8V+Y2sLrMB+OJ9OFvabtlvncsPF2H4oX8rGfpAPcckNCcaYhipBXtNSEr2rV4DpIjJFRMJFJEZEJolIkjEmDUgFficiUSJyNjDd5bF7sJ/0l4hIJLYzMNrl/meAh0WkB4CIdBKRGW7G9TzwBxHpJ9ZwEekAYIzJwHasvgy8ZYwpqecYK4D+InKNiESIyNXYr6HvuxlDYx0HejfTsU9hjKnCttE+LCKtnTK+B/v/RERmiUiSs/tJbFKqcjPO54F7RWSMU/Z9neN/g/1w/7mIRIrIJOzrYZmXn15dXgcuEZHJzmvtZ9imq6+aeLzPsM1+NaNMVtW6DbZZ6tfO67Yj8Fuc8m0gxrkiMlhEWgEPNLBvFPa9kgVUisg0bB/MaTn/+7eBB0WklYgMxja11st5/8ZgP0ginfd5Tb67BPte8Rnnw/Q54FER6ezEmCgiU3xx/haX6J2vazOwHWxZ2NrcfXxfFtcAE7Cdkg9g235rHpuHHbL1PLZmVYTtZa/xOLaz5WMRKcB2bE1wM7RHsG+cj7EjC17AdnjVeAnb5ltfsw3GmBPApdikcALbnHGpMSbbzRga6wVgsPNVtL5vGd50B7bMD2DbXpdg24nBdqR+IyKF2P/BXcaYb537HgRecuK8qvZBjTFvAA87xyvAfmNqb4wpx47OmIatkT0F3GCM2dU8T++UmHZjm+n+7px7OnYIcXkTD/kZtgNwdT23wXYep2JH8mzFdhTXe52EMeYDbAfjSmxH8coG9i3AfqN4HftBfA32/+Su27FNIMew7eIvnmb/j7E16InYvpsS4FwRScB+K27qB6YnfoEtp69FJB87ymqAL04sTqeAqoeIPIjteLnudPs2cxznYmtXPWu1GSul3OR80F9pjPmvD/xQ1uJq9MHI+ep+F3ZEjCZ5pZouF3vdRosS6Fe3tXgiMgj7dXozdry5UqqJjDEf+zsGf9CmG6WUCnHadKOUUiHO6003zhCmPwBtgFRjzEvePodSSin3uZXoRWQhdthepjFmqMv2qdghheHYjsI/YYcuJmKHJ2bUcbj/0rFjR9OzZ8/GRa6UUi3c+vXrs40xp7261t0a/SLsLI7fjSl3rgp9EjvpUgawTkSWY8eFrjHGPCsibwKfnu7gPXv2JDU11c1QlFJKAYiIW1NBuNVGb4xZja2huxoP7DPGHHAu4liGrc1nYC+IgO+vTFRKKeUnnnTGJnLqJEMZzra3gSki8ndOveruFCIyX0RSRSQ1KyvLgzCUUko1xJPO2LomIzLGmGLstKgNMsYsEJGjwPSoqKgxHsShlFKqAZ7
U6DM4dTa5JE6d6e60jDHvGWPmJyQkeBCGUkqphniS6NcB/USkl4hEAbNp3CRFiMh0EVmQl5fnQRhKKaUa4laiF5Gl2JWKBohIhojMcybXvx27Os9O7Go42xtzcq3RK6VU83Orjd4YM6ee7SvwYF5nEZkOTO/b1931IZRSSjVWQMx1063vEHPzX5q2lkP7uCiuHJNEjw5xp99ZKaVCiIisN8aMPd1+fp29sqZGH9OtLx9tP3ba/euSW1zBP/6zj0n9O3HDxJ6c168TYWHurE6mlFItQ0DU6MeOHWuaemVsZn4pS9Ye4tVvDpFVUEbPDq247owezBqbTEJspJcjVUqpwOFujT7oE32N8spqPtp+jMVrDrLu4EliI8OZOSqRGyf2YGDXNt4JVCmlAkhQJHqXzthb9u7d67Xjbj+Sx8tr0vjnpsOUVlQzoVd7bpzYkwsHdyEyXGdmVkqFhqBI9DW8UaOvS25xOa+npvPy12mk55TQtU0M105IYfb4FDq1jvb6+ZRSypc00buoqjas2p3JS2vSWL0ni8hw4ZJh3bhxYk9GJrdFRDtvlVLBJygSfXM13TTkQFYhL3+dxpupGRSUVTI8KYEbzuzJpcO7ERMZ7pMYlFLKG4Ii0ddo7hp9XQrLKnln42EWf3WQvZmFtGsVyezxKVw7IYWkdq18GotSSjVFcCX6MaNM6pefNe3BETEQEdXkcxtjWHPgBIu/SuPjHXYs/wWDunDjxJ5M7NNBm3WUUgEruBJ993CTOj++aQ8Oi4Duo6Hn2fYneQJEN+1Yh3NLWPJNGkvXppNTVE7fzvHMHpdMcvtWtImJJCE2kjaxESTERhIXFaEXZiml/Cq4Ev3AFJO68OdNe3BRJqR9BYc3gKlyEv8o6HEW9DwHUiZAdOtGHbK0oooVW4/y0lcH2ZxR98yaYQKtayX/NjH2J6FVJG1inG0122MjSYiN+O629gcopTwVFIneq52xZYWQ/g0c/ALSvoTD66G6EiQcuo90SfxnQIz7F1AdyS0hp6ic/NIK8ksqyC+pJL+0grwSezuvpIL80kqXv+3v0orqBo8bFRFGQqz9oOiWEENSu1YktYslub3zu10rOsZHadORUqpeQZHoazRLZ2x5EaSv/T7xZ6RCdQVIGHQbcWrij23r3XMDZZVVFJRWNviBkF9SQW5xBUdyS8g4WcKJovJTjhETGUb3hFhax9pvCK1jImgdHWl/x0QSFRFGeBiEh4URESaEhQkRYUK4COFhQkS4ECZCq6hw4qMjiHceHx8TQXx0BFERevGYUsFME31t5cWQsc4m/YNf2L+rygGBbsOhh9PG3+NMiG3XvLHUo7i8koyTJWScLCY9x/4+kldKfkkFBaWVFJTW/K6kpMLzddejIsJo3yqKX148kBkjE73wDJRSvqSJ/nQqSmwtvybxp6+FqjJAoOtQW9vvcRb0mAit2vs2NjdUVlVTWW2orDZUVRmqjKGyuprqaqisrqaq2lDl3F9SXkVhmf2AKCyrpLC0wt4uq+SbAzlsSs/l3ov68z/n99WmIqWCSFBMU+xXkbHQ6xz7A1BRatv1076Eg59D6kL4+ilAIHk89J8KA6ZBp4EQAMkwIjyMCC/055ZVVnH/W1v568d7OJRTzMOXD9P5gJQKMS23Rn86lWV2JM+BVbDnQzi6yW5v28Mm/P5TbY3fgzH8gcIYw6Of7OGJlfvo0aEVM0YmMnNkd3p3auKQV6WUTwRF040/pkBosvwjsOcjm/QPrILKUohuA31+YBN/v4sCsomnMT7cZqd5XnPgBMZA745xDE9KYFhSWy4e1pVuCbH+DlEp5SIoEn2NgKzRN6S82Knpf2CTf+FxO5onecL3TTwd+wdEE09THMsr5b3NR1h7MIetGXkcyy8lMlz44egk5p3di14d44jQ5h2l/E4Tva9UV8PRjbD7Q5v4j22129v1gkHTYfBMSBwdtEkfIO1EEc9//i2vpaZTXllNmECXNjF0bxtLt4QYLhj
UhRkju2tHrlI+poneX/IybPPOrhXw7Wf2oq2EZBg8w/4kjoWw4KwNZ+aX8umuTI7klnAkt5QjuSUcyinmcG4JM0Z25+HLhxEf3XL795XyNU30gaDkJOz+AHa8C/tX2nH7bRJh0GU26SdPCNqkX6Oq2vD0qn088skeuraJ4e4L+nPF6ERt2lHKBzTRB5rSPNu8s+Nd2PdvO2Y/visMvgyG/hCSxgd10l+flsPv39vB5ow8YiLDaNcqioTYSNq2iiQqIpzoiDD6dY6nb+d4OreOoXObaDq3jiYhNlKbfJRqIr8lehGZBPwB2A4sM8asOt1jWkSid1VWYDtxd7wLez+ByhJISIFhP4Rhs6DLEH9H2CTGGD7ZcZxvvs0hz5neIa+knPIqQ3FZJQeyi6iqPvX1FhUeRqfW0Qzu3oYzencgqV0skwZ0ItobFwkoFeK8muhFZCFwKZBpjBnqsn0q8DgQDjxvjPmTiJwH3A8cBx4yxuw73fFbXKJ3VVYIu1fA1jdg36d2Bs7Og2HYlTD0SmjXw98Rek1pRRVH80rJzC8ls6DM+SklM7+Mtd/mcDi3BIDuCTEMTUwgJjKcbm1juHRYd4YlJfg5eqUCj7cT/blAIbC4JtGLSDiwB7gQyADWAXOAXcaYahHpAjxijLn2dMdv0YneVVE2bH8Htr4J6V/bbckTbC1/8EyI7+Tf+JqRMYaconK2HM7jxS8PkplfSmlFFYdzSxCEZbeewegU/8xBpFSg8nrTjYj0BN53SfRnAg8aY6Y4t38JYIz5o3M7ClhijLnydMfWRF+Hk2mw7S2b9DO32+mW+5xvk/7ASxo9x36wOlFYxsynvqSgtJJfTRvElWOSdMEXpRy+SPRXAlONMT9ybl8PTABWAlOAtsDT9bXRi8h8YD5ASkrKmLS0NLfiaJGOb7cJf+ubkHcIImJhwFQYPhv6XgDhoT2k8dvsIu57YzOpaSfp3TGOLm1iiIsOJzYqgriocHp2jKNPp3gm9ulAnA7vVC2ILyY1q6taZYwxbwNvn+7BxpgFInIUmB4VFTXGgzhCX5ch9mfyb+0sm1vfgO1v22aeuM4w/CoYeU3QduKeTq+Ocbxx25m8veEw7205QlFZJUfzKigut3P+ZxeWARAbGU5y+1gGdm1DQmwkXRNiOK9/J4Z0b6Mje1SL1mxNN42hTTdNUFVhR+xsetWO4KmugK7DYeS1tnknroO/I/SZ7MIy9hwr4OMdx/k2u4i0E0XklVRwsrgCgM6toxnYzSb/Xh3jiIkMIzoinNEpbRml7f4qiPmi6SYC2xk7GTiM7Yy9xhizvRFBBs+kZoGs6ARse9Mm/aOb7bq5/afaWn7fC0Nihs2myC4sY9XuLFbtziQ9p5isgjKO5JV+d78ITBnclalDu9KlTQzJ7WPpnhCrfQAqaHh71M1SYBLQETts8gFjzAsicjHwGHZ45UJjzMNNCVZr9F50fDtsWgJbXrcLp7fqAMOugpFzbI2/hTdhVFcbyquqOVlczrOfHeDN9RkUllV+d39MZBjd28YyoEtrbpzYkzN6t5xvRir4BMWVsVqjb0ZVlbD/U5v0d6+w0y90HmJr+cOvgvjO/o4wIJSUV3E4t5hjeWUcyinmQFYhR/JKWPvtSbILy7hgUGeGJibQIT6alPatOKN3e72YSwWMoEj0NbRG38yKc2zn7aYldhWtsAg7lfLoG+18+mGauGorLKvk75/u5bXUdHKdtn6A+OgIzh/YmQFd4hmd0o4zenfQph7lN0GR6LVG7wdZu2Hjy7BpKRRn20nWRl1nO3FD6Cpcb6pwmnp2HMnnw23HWLkrk8wCO9KnVVQ4g7u1YUj3NvTv2ppxPdvTv0vLuMZB+V9QJPoaWqP3g8pyO3/+hsV26gWA3pNg9A32gqyIaH9GF/AKSiv4cNsxth/JZ+vhPHYdzaeovAqAkcltObd/J4YnJjChd3tax0T6OVoVqjTRK/flptsROxtfgbx0iG0PI+bA6Ouh8yB
/RxcUjDEcPFHMOxsPs3pPFpszcjEGOsRFcenwbvTqGMdFQ7rSva0ux6i8JygSvTbdBJjqKjjwH1vL37XCjs1PGm9r+UMuh2hdLNxdBaUVbE7P45FPdrP7WAFF5VXERIYxrmd7RiW3ZeaoRHp1jNMLuZRHgiLR19AafQAqzIIty2zSz94DUfF23vzRNwb90oi+Zoxh17ECXvk6jY2Hctl1LJ9qYy/kumpsMjNHdadvZ23XV42niV55hzF22oUNi+3InYpiO43y6Btg+NXQqr2/Iww6R3JL+PfO46zancXKXZkAJLeP5aLBXTm7X0fO6tORqIjgXYRG+U5QJHptugkypfl2Rs0Ni+HIBgiPtk06Y2+y0ylrLb/RMvNLWbH1KCt3Z/H53iyMgeiIMGaNTeK+KQNJiNWOXFW/oEj0NbRGH4SObYXUF+0VuOUFtpY/5iYYcTXE6CIhTVFQWsE3B3JYtu4Q/96ZSUSYMG1YN87o3Z7JA7vQNSHG3yGqAKOJXvlGWaGdZyf1RTi6CSJbwdArYOzN0F3b8pvCGMOGQyd5e8Nhlm86QkFZJVERYZzZuwMT+3RgdI92jExuS6QuwN7iaaJXvnd4A6x/0c6bX1Fs59YZe5OdTbOFLJTibcYYdh4t4PXUdL7Yl82+zELATsncp3Mc88/tw5QhXXRahhZKE73yn9I826SzfhEc32ZH7AybZZN+txH+ji6oZRWUsT4th9V7s1m5M5Nj+aXER0cwfUR3/uf8PiS1a+XvEJUPBUWi187YEGcMZKRC6kI7YqeyFBLH2Lb8oVdAVJy/Iwxq1dWGz/Zm8f7moyzffJjKasPkgV2YNTaJ8/p3IiZSa/mhLigSfQ2t0bcAJSdh82s26WfvhugE23E75iboMtjf0QW9g9lFLPj8AMs3HaGwrJK4qHCuHJPEvVMG6BQMIUwTvQpMxsChNTbh73jXTp+cfIZt1hk8AyJ1igBPlFZUsT7tJP/ceJi3NmTQPi6K2eNSuOXc3jpUMwRpoleBr+gEbF5iR+zk7IfYdnYWzbE3Q4c+/o4u6G08dJLHP93Lqt1ZdIyP4pZzenPlmCQ6xOuEdaFCE70KHsbAt6sh9QXY+T6YKugzGcb9CPpP0fnyPfTNgRP8+cNdbDiUS2S4MGVIV+ZO7MnYnnpVc7DTRK+CU/5R2PCSHbFTcBQSkmHMXDvlgq6K1WQ1wzTfXJ/Bm+vTyS+t5MHpg5k9PkU7bYNYUCR6HXWj6lVVAbs/gHXPw7efQVikbcMfNw9SztQLsTxQUFrB/yzZyOo9WbSOieCa8SnMGptM3846O2mwCYpEX0Nr9KpB2Xtt5+3GV6Esz063MG6enVRNL8Rqkupqw5oDJ3h5TRqf7DxOVbVhQq/23DW5H2f26aDTJwcJTfQq9JQX2UnV1j4Hx7bYC7FGzIax83SIpgeO55eyfNMRnlq1j5PFFQzs2pqfTx3A+QM6a8IPcJroVegyxi5yvu552PY2VJVBj7NsLX/gdIiI8neEQam0oor3txzliU/3ciinmJHJbbnrgn6c16+TLoAeoDTRq5ah6ARsesU27Zw8CHGdbcftmLnQNtnf0QWliqpqXk9N5++f7uNYfikjkhL466wR9NNFzwOOJnrVslRXw/6Vtpa/50PbWdt/mq3l9z4fwnSmx8Yqq6xi+aYj/O69HRSWVTJtaFd+felgEnXd24Dh10QvInHAauABY8z7p9tfE73yqpNpdnjmhsVQnA3te9t2/JHX6IpYTXA0r4RFXx7k2dUHiAgT5k7syS+mDdRpkgOAVxO9iCwELgUyjTFDXbZPBR4HwoHnjTF/crb/HigCtmuiV35TWQY737O1/ENrICLGrns7bp6dXE01StqJIp5etZ9l69Lp1TGOey7sz/QR3f0dVovm7UR/LlAILK5J9CISDuwBLgQygHXAHKA70BGIAbI10auAcGybvfJ282tQUQTdR8G4W+wsmjq/TqP8e8dxHv33HrYfyecHAzvzpx8
Oo3NrXf3KH7zedCMiPYH3XRL9mcCDxpgpzu1fOrvGA3HAYKAEuNwYU93QsTXRK58pzYctr9laftYuO7/O6Bts0067Hv6OLmhUVFXz4pff8pePdtMmJpJZY5O59dzetIvTEU++5ItEfyUw1RjzI+f29cAEY8ztzu25NFCjF5H5wHyAlJSUMWlpaW7FoZRXGAMHP7dj8nf9C0w19J8K42/RzttG2HjoJI98socv9mXTKT6aW8/rw3VnpOiKVz7ii0Q/C5hSK9GPN8bc0YggdQoE5X95h+0SiOsXQVEWtO9jJ1QbeQ3EtvV3dEFhfdpJ/vzBLtYezKF3xzh+efEgLhikF1w1N7813Rhj/tjYYLXpRgWEyjLYsRzWLoCMtXah8+FX21p+lyH+ji4orNqdye/f28GB7CLO7N2B388YouPvm5EvEn0EtjN2MnAY2xl7jTFmeyOC1Bq9CkxHNsG65+xC55WlzpW3P4JB0yFcF/BoSE37/dOr9lNRZfjDzCFMH96dCB2O6XXeHnWzFJiEHU1zHDs+/gURuRh4DDu8cqEx5uGmBKs1ehWwinNg4yu28zY3DeK72tWwxsyF1l39HV1AO5Jbwo9eSmXH0XzO69+JJ+aM0lWuvCworozVGr0KGtVVsO/ftvN23ycQFgGDLoPx8yHlDJ02uR5V1YZnPtvPo5/sIbFdLE9fO4bB3dv4O6yQERSJvobW6FVQObHfmTb5ZSjNgy5DbTv+sFkQFefv6ALS+rQcfvLqBnKLK3j48mFcOSbJ3yGFhKBI9FqjV0GtvAi2vgFrn4fjWyE6AUZdZ6+81TVv/0t2YRl3LNnImgMnuGZCCr+9dLCubuWhoEj0NbRGr4KaMZD+jR2ts+NdqK6EvhfYK2/7Xahr3rqorKrmrx/v4ZnP9tOzQyv+dtVIxvRo5++wgpYmeqX8oeAYrH/JNu0UHoO2PWwNf9T1OqGaiy/2ZvOLt7aQWVDKwzOHMWtsko65b4KgSPTadKNCVlUF7Hrfdt6mfelMqHalbcvvPtLf0QWEvOIK5r+cyjff5jB7XDK/mzFEr6htpKBI9DW0Rq9C2vHtNuFveQ0qiiFpnB2tM3gGRET7Ozq/qqyq5nfv7eDlr9Phf4cjAAAXQ0lEQVQY26MdL940jtYxOgTTXZrolQo0JbmwealN+jn7Ia4TjL7RjstPaNmjUJZvPsLdyzZyVt+OPHXtaE32bgqKRK9NN6pFqq6GA/+xF2Ht/gAkDAZebGv5Pc9psWPyX09N5/63tnBW3468OHecXknrhqBI9DW0Rq9arJNptuN2w2IoyYFOA+1UCyNmQ3TLmyNm6dpD/PLtrdx8Vi9+O32wv8MJeO4mev3IVMqf2vWAC38H9+yAmU/bRVBW3At/GwT/uheydvs7Qp+aMz6FuRN7svDLb3n2s/3+DidkRPg7AKUUNsGPvMb+ZKy3Y/I3vGQnVut1Hky41c6X3wLG5P/20sFkF5bxxw920bdzPJMHdfF3SEFPm26UClSFWTbZpy6E/MOQkALjbrYduCE+Jr+0ooppj39OdkEZC24Yy5l9Ovg7pIAUFG302hmrlBuqKmH3ClvLP/j592PyJ8yHbiP8HV2zOXSimHkvreNIbglPXjuaSQM6+zukgBMUib6G1uiVctPxHbY5Z/MyOyY/eYIdrTPoMogIvfVaj+aVcNOL69ibWcgjV41gxshEf4cUUDTRKxXKSnJh0xKb9HMOQHwXGOPMk9+mm7+j86rCskpuXLiWTem5/G3WCGaO0mRfQxO9Ui1BdTXs/9Q26+z92M6TP3iGreUnTwiZMfnF5TbZr087yWOzR3HZiO7+DikgaKJXqqU5sR/WvWBXxCrLg67DbcIfdqUd1RPkCssque75b9h+JI/lt5/NoG66gIkmeqVaqvIiO6/O2ucgcwfEtoPRN8DYeXbcfhA7UVjGlMc+Bwzv33EOXRNi/B2SXwXFBVMiMl1EFuTl5fkzDKVCS1QcjL0ZfvwV3Pi+nVb
hq3/A4yNg6RzY/x87h34Q6hAfzZJbJlBUVsVPX9tERVW1v0MKClqjV6olyMuw4/HXL4LiE9Cxv23WCdKpFl5bd4hfvLWVy0cl8rdZIwgLC42+iMYKihq9UspHEpJg8m/hpzvg8mchKv77qRZW3AfZwXUdy9XjUrj7gn68s/Ewf/24ZU0T0RQ6BYJSLUlkjK3Fj5jtTLXwrK3lr10Avc+3tfz+U4JiqoW7JvfjeH4pT63aT/e2sVx3RnD3PzQnbbpRqqUrzHTm1VkIBUegbYqdQTMIlj+srKpm/svr+c/uTJ69bgwXDenq75B8SkfdKKUap6oCdv3L1u5rlj8cNsvW8rsN93d09Sosq+Ta575mb2Yhz14/hnP6dfJ3SD7jt0QvIoOAu4COwKfGmKdP9xhN9EoFmGPbnKkWXoPKEkg50653O+gyCA+81Z8y80u5/oW1HDxRxCc/PY+UDq38HZJPeLUzVkQWikimiGyrtX2qiOwWkX0icj+AMWanMeY24CrgtAEopQJQ16Ew/XH42U646GEoOApv3gyPDoVVf4aC4/6O8BSd28Sw4IYxVBvDvW9splKHXZ7C3VE3i4CprhtEJBx4EpgGDAbmiMhg577LgC+AT70WqVLK92LbwcTb4Y6NcM3r9gNg1f/Bo0PgrR9B+tqAGZPfo0Mcf7piOGsP5vDQv3b6O5yA4laiN8asBnJqbR4P7DPGHDDGlAPLgBnO/suNMROBa70ZrFLKT8LC7Gic696C29fbzto9H8ELF8KC82Djq1BR4u8ouWJ0InPGp7Doq4O8vOagv8MJGJ6Mo08E0l1uZwCJIjJJRJ4QkWeBFfU9WETmi0iqiKRmZWV5EIZSyqc69oVpf4J7dsIlf4PKMnj3J/DIYPjkAcg95LfQRISHZg7l7L4d+d17O1h3sHb9tGXyJNHXdSmaMcasMsbcaYy51RjzZH0PNsYsAH4HbIiKCr15tJUKedHxtmb/k6/hxvegx0T46gk71cKya+HAZ35p1gkPEx6bPZLk9q24fckGcovLfR5DoPEk0WcAyS63k4AjjTmAMeY9Y8z8hIQED8JQSvmVCPQ6F2a/CndtgbPugrSvYPFl8OQEO7laWaFPQ+oYH80Ts0eRVVDGI5/s8em5A5EniX4d0E9EeolIFDAbWN6YA+ikZkqFmLbJcMGDtlln5tN2euQV98Ijg+CDX0D2Pp+FMiwpgevP6MErX6exKT3XZ+cNRG6NoxeRpcAk7Nj448ADxpgXRORi4DEgHFhojHm4KUHoOHqlQpQxkJFqp1rY/k+oroA+k+1FWP0ubPapFvJKKpjy6GoiwoXlt59N+7jQaiYOiitjdXFwpVqQguN2qoXUhXZcfruezlQL19lhnM1kU3ouVz27hjEp7Vg8bzyR4aEzl2NQJPoaWqNXqgWpqoCd79m2+0NfQUQsDL/K1vK7Dm2WU761PoOfvbGZOyf3454L+zfLOfwhKKYp1jZ6pVqg8EgYegXc/AHc+jkMnwVbXodnzoKF02D7O/bDwIt+OCaJK0Yn8veVe/lkR2Bd1esLWqNXSvlfcY5d63bdc3YcfuvudpWsMTdCfGevnKKkvIrLn/qS7MJyPrjrHDq1jvbKcf0pKGr0SikF2OmQz7oT7twEc5ZB54Hwn4fsRVhv3WI7dD2slMZGhfPo1SPJKynnJ6+up6ra/5VcX9GmG6VU4AgLhwHT4Pp34PZUW6vf/QE8PxmeOx82LYWK0iYfflC3Njw0cyjrDp7k/324y4uBBzZtulFKBbayAti8zM6Tn70HWnWA0TfCuHl2icRGMsbwq3e2sXTtId647UzG9QzsxVUaoqNulFKhxRj49jP4ZgHs+cBuG3iJHa3T8xx7ha6bissrOe8vq0hsG8s7P5mINOKxgSQo2ui16UYp5TYR6D0J5iyxbfkT74SDX8BL0+GpM2HdC25PtdAqKoJ7LuzPpvRcVu0O/UkVtUavlApeFSWw7S345lk4tgWiE2DUtfZCrA5
9GnxoeWU1kx9ZRWR4GB/dfW5QXkgVFDV6pZTySGSsvbL21tVw88d2WoW1C+Dvo+GVK2HPx1Bd92pTURFhPHDpEA5kFbHoy4O+jdvHNNErpYKfCKRMgCtfgJ9uh0m/tDX8JbNs0l/zJJT898Rmkwd15rz+nXj033vILizzQ+C+oYleKRVaWneFSffD3dvghy9AfBf46Fd2Bs337obj27/bVUT49SWDKKmo4rnVB/wYdPPSzlilVGiKiIJhV8K8j2D+Z3bahc1L4emJsOhS2PEuVFXSr0trLhvRnRe/OsjhXP8vh9gctDNWKdVyFOfAhsV2hE7eIWiTCGNv4mjfqznnH9uYNTaJP14x3N9Ruk07Y5VSqrZW7eHsu+GuTTB7KXTsBysfotsLY3i9y0tsX7eK9Jxif0fpdVqjV0q1bFm7Ye1zVG9aQlhFERlxQ0i66C4YMhMiAnviM63RK6WUOzoNgEv+StjPdvFax9upKMyBd+bDo0Ng5UOQ36ilsAOSJnqllAKIacPYq3/JD8r+wluDHofEMbD6r/DYMHhjrl3wPABaQJpCR90opZSjT6d4fjCwKw/s6Ebe5a/AnRtgwm2wfyW8OA2eOQfWvwTlwdWO79dEb4x5zxgzPyEhwZ9hKKXUd35yfl8KyypZ8s0haN8bpjwM9+yE6Y+DqYb37rRj8j/+NZw86O9w3aJNN0op5WJMj3ac2bsDL375LRVVzvQJUXEwZi78+EuYuwJ6nwdrnoLHR8KS2bbGH8DNOprolVKqlvnn9SazoIzFa9JOvUMEep4FVy2Gu7fCOT+DjHXw8uXwj3F2CuWyAv8E3QBN9EopVcuk/p04o3d7/r5yL3kl9SxUnpAIk38D9+yAy5+F6NbwwX3wt0Gw4j7I3uvboBugiV4ppWoREX4xdSB5JRXMfXFtwztHRMOI2TD/P/CjlTDwYli/CP4xFhbPtEshVlf5JO76aKJXSqk6jEppx6T+ndh4KJeNh06696CkMXDFAjuD5vm/thdjLZ0NT4yCL5+wUzD4QbMkehGZKSLPici7InJRc5xDKaWa20OXDwPgjysauZB4fGc47z64ewvMWmTXtv3kN/DIYFh+Bxzb6v1gG+B2oheRhSKSKSLbam2fKiK7RWSfiNwPYIz5pzHmFmAucLVXI1ZKKR9JbBvL0MQ2rD2YQ8bJJoydD4+EIZfDTSvgti9g+FWw5Q145mxYOA22vQ1V9fQBeFFjavSLgKmuG0QkHHgSmAYMBuaIyGCXXX7t3K+UUkHpd5cNBeDuZZs8O1DXYXDZE7bz9qKHIP8wvHkTpC70QpQNczvRG2NWA7UbmMYD+4wxB4wx5cAyYIZYfwY+MMZsqOt4IjJfRFJFJDUrK/QX51VKBacxPdrRo0MrUtNOciyv1PMDtmoPE++AOzfCnNdg2CzPj3kanrbRJwLpLrcznG13ABcAV4rIbXU90BizwBgz1hgztlOnTh6GoZRSzefB6UMAeOhfO7x30LBwGDDVJv5m5mmilzq2GWPME8aYMcaY24wxz9T7YJ3rRikVBM4f2Jm+neN5f8vRoJyv3tNEnwEku9xOAoJ/Tk+llKrlzz+0K0/96h3fjpjxBk8T/Tqgn4j0EpEoYDaw3N0H66RmSqlgMaZHO9rERPD53mxOFpX7O5xGaczwyqXAGmCAiGSIyDxjTCVwO/ARsBN43RizvaHj1DqmNt0opYJGzXqyP39ri58jaRxdSlAppRqh3/+uoKLKsO13U4iPjvBrLLqUoFJKNYOHZtpx9U+v2ufnSNynK0wppVQjzBpjx5889/m3fo7EfbrClFJKNUJYmHDBoM6UV1az7XBwVFK1Rq+UUo30i6kDAfjHyuBovtEavVJKNVK/Lq0B+HD7MT9H4h7tjFVKqSaYO7EnAB9sPerfQNygiV4ppZrgzsn9APjrx7v9HMnpaRu9Uko1Qfu4KPp3iWd/VlHT5qr3IW2jV0qpJvrNpXb5jUc+2ePnSBqmTTdKKdVE5/S
zU6y/veEwgTDLQH000SullAeuGJUIwAZ3FxD3A22jV0opD9x6Xh8A/ufVjX6OpH7aRq+UUh4Y0LU1Se1iOZZfyoGsQn+HUydtulFKKQ89ctVIAP7yUWAOtdREr5RSHhrfqz0i8MG2Y1RXB16nrCZ6pZTygmsnpACwfHPgraaqiV4ppbzg3osGAPDiVwf9G0gddNSNUkp5QdtWUXRLiGFzeq6/Q/kvOupGKaW8ZPKgzgBsyQisZK9NN0op5SVXOqtPvbYu3c+RnEoTvVJKecmwRNs68f6WwJq6WBO9Ukp5SXiYcF7/TuSVVFBaUeXvcL6jiV4ppbzo3P52orNVu7P8HMn3NNErpZQXTRnSBYA312f4OZLveT3Ri0hvEXlBRN709rGVUirQJbVrBcDqvUFWoxeRhSKSKSLbam2fKiK7RWSfiNwPYIw5YIyZ1xzBKqVUMJg7sSflldVsDJCpi92t0S8CprpuEJFw4ElgGjAYmCMig70anVJKBaEZI7sD8O6mwJgOwa1Eb4xZDeTU2jwe2OfU4MuBZcAMd08sIvNFJFVEUrOyAucrjlJKeWqoM8xyf4BMW+xJG30i4HpVQAaQKCIdROQZYJSI/LK+BxtjFhhjxhpjxnbq1MmDMJRSKrBEhocxNLENn+/NDohhlp4keqljmzHGnDDG3GaM6WOM+WODB9C5bpRSIeqsvh0B2Hk038+ReJboM4Bkl9tJQGA0SCmllJ9N6m/nvVnyzSE/R+JZol8H9BORXiISBcwGljfmADqpmVIqVJ3ZpwOdWkfz2R7/90G6O7xyKbAGGCAiGSIyzxhTCdwOfATsBF43xmxvzMm16UYpFcpiI8PJLCgju7DMr3G4O+pmjjGmmzEm0hiTZIx5wdm+whjT32mPf7ixJ9cavVIqlN01uR8AJwrL/RqHToGglFLNpEN8FAAPLN92mj2bl64wpZRSzeSsvh2JDBfKK6v9GoeuMKWUUs0kMjyMSQM6s+FQLvmlFX6LQ2v0SinVjDq3jgbgw63H/BaD1uiVUqoZ3TdlAACFZZV+i0E7Y5VSqhnFRIYDcDi3xG8xaKJXSqlmFB1h0+wLX3zLoRPFfolB2+iVUqoZiQh3OuPpswpL/RKDttErpVQzO7N3BwDKK41fzq9NN0op1cyiIuxkv+VV/hlPr4leKaWaWXSE7ZC9743NbPDD8oLaRq+UUs2sf5fWXH9GDzILyth+2Pf5TtvolVKqmUVFhHHPhf0BqKjyfTu9Nt0opZQPRITbdvqqak30SikVkiLDbbqtqPZ9h6wmeqWU8oGIMFujr9SmG6WUCk3hYYIILFh9gB+/st6n59ZRN0op5QMiwv9ePIjubWNY7eN1ZHXUjVJK+ciPzunNpAGdqTK+bb7RphullPIhEfD1wBtN9Eop5UPhIlT7ONNroldKKR8KDxNtulFKqVAWJoIxYHyY7DXRK6WUD4WH+f4K2QhvH1BE4oCngHJglTHmVW+fQymlgtV3id4Y7yfgerhVoxeRhSKSKSLbam2fKiK7RWSfiNzvbL4CeNMYcwtwmZfjVUqpoCY2z+PLZnp3m24WAVNdN4hIOPAkMA0YDMwRkcFAEpDu7FblnTCVUio0hDuZ/r43t3DPa5v45sCJZj+nW4neGLMayKm1eTywzxhzwBhTDiwDZgAZ2GTf4PFFZL6IpIpIalaWb68SU0opfxmZ3JbeHePYlH6SdWk5ZBeWN/s5PWkiSuT7mjvYBD8BeAL4h4hcArxX34ONMQuABQBjx471z0KKSinlYxN6d2DlvZN8ek5PEr3Usc0YY4qAm9w6gMh0YHrfvn09CEMppVRDPBlemQEku9xOAo54Fo5SSilv8yTRrwP6iUgvEYkCZgPLG3MAndRMKaWan7vDK5cCa4ABIpIhIvOMMZXA7cBHwE7gdWPM9sacXKcpVkqp5ie+vAy3PmPHjjWpqan+DkMppYKKiKw3xow93X46BYJSSoU4XWFKKaVCnK4wpZRSIc5Xc+r
UqWYcPZAvInudzQmAaxW/9u3a2zoC2c0YZl3n9/ZjG9qvsfe5s6327UAtQ2+UX0P3a/m5t19jyq+u7foebr73cA83YrNzIgfSD7Cgodu1twGpvoynOR7b0H6Nvc+dbXXcDsgy9Eb5NXS/lp/3y8+d8qq9LVDLz1tl6Iv38Ol+ArEztva0CXVNo1Dv1ArNwJNzufvYhvZr7H3ubPNl+XlyPm+UX0P3a/m5t19jyq+u7foebtx9Xn8NBsTwSk+ISKpxY3iRqp+WoWe0/Dyj5df8ArFG31gL/B1ACNAy9IyWn2e0/JpZ0NfolVJKNSwUavRKKaUaoIleKaVCnCZ6pZQKcSGX6EUkTkReEpHnRORaf8cTbESkt4i8ICJv+juWYCUiM53X37sicpG/4wk2IjJIRJ4RkTdF5Mf+jicUBEWiF5GFIpIpIttqbZ8qIrtFZJ+I3O9svgJ40xhzC3CZz4MNQI0pP2PXAJ7nn0gDVyPL8J/O628ucLUfwg04jSy/ncaY24CrAB126QVBkeiBRcBU1w0iEg48CUwDBgNzRGQwdqWrmrVsq3wYYyBbhPvlp+q2iMaX4a+d+1Ujy09ELgO+AD71bZihKSgSvTFmNZBTa/N4YJ9TAy0HlgEzsEscJjn7BMXza26NLD9Vh8aUoVh/Bj4wxmzwdayBqLGvQWPMcmPMRECbX70gmBNhIt/X3MEm+ETgbeCHIvI0vr9UPZjUWX4i0kFEngFGicgv/RNa0KjvNXgHcAFwpYjc5o/AgkR9r8FJIvKEiDwLrPBPaKHFr7NXekjq2GaMMUXATb4OJgjVV34nAE1O7qmvDJ8AnvB1MEGovvJbBazybSihLZhr9BlAssvtJOCIn2IJRlp+ntMy9IyWn48Ec6JfB/QTkV4iEgXMBpb7OaZgouXnOS1Dz2j5+UhQJHoRWQqsAQaISIaIzDPGVAK3Ax8BO4HXjTHb/RlnoNLy85yWoWe0/PxLJzVTSqkQFxQ1eqWUUk2niV4ppUKcJnqllApxmuiVUirEaaJXSqkQp4leKaVCnCZ6pZQKcZrolVIqxGmiV0qpEPf/AQKMofICqH8oAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
# --- Zipf-style check ------------------------------------------------------
# Plot the count of the n-th most common character against an M/n reference
# line on log-log axes.  `plt` comes from the notebook's top import cell.
ranked_counts = [count for _, count in all_character_counts.most_common()]
plt.xscale('log')
plt.yscale('log')
plt.title("Frequency of n-th most common word and 1/n line")
plt.plot(ranked_counts)
M = ranked_counts[0]  # count of the single most common character
# Ranks are 1-based; emit one reference point per rank so both curves have
# the same number of points.
plt.plot([M / rank for rank in range(1, len(ranked_counts) + 1)])


# --- Unigram probability lookup --------------------------------------------
def get_probability_from_counts(counts):  # fast method
    """Build a probability lookup from a mapping of item -> occurrence count.

    The grand total is summed once, here, and captured by the returned
    closure, so each later lookup costs one dictionary access.

    Args:
        counts: mapping (for example a collections.Counter) whose values are
            occurrence counts.

    Returns:
        A function ``get_prob(char)`` returning ``counts[char] / total``.
        Items missing from ``counts`` get probability 0, and when the total
        is zero every item gets 0.0 instead of raising ZeroDivisionError.
    """
    total_occurrences = sum(counts.values())

    def get_prob(char):
        if not total_occurrences:
            return 0.0
        # dict.get(key, default) yields the default for unseen items.
        return counts.get(char, 0) / total_occurrences

    return get_prob


get_char_prob = get_probability_from_counts(all_character_counts)


def get_char_prob_slow(char):  # slow method
    """Same value as get_char_prob, but re-sums every count on each call.

    Kept only as the baseline for the timing comparison below.
    """
    total_occurrences = sum(all_character_counts.values())
    if not total_occurrences:
        return 0.0
    return all_character_counts.get(char, 0) / total_occurrences


# --- Timing helper ---------------------------------------------------------
import time


def get_running_time(func, arg, times):
    """Call ``func(arg)`` ``times`` times and print the total elapsed seconds.

    Args:
        func: callable taking one positional argument.
        arg: the argument passed on every call.
        times: number of repetitions.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(times):
        func(arg)
    elapsed = time.perf_counter() - start
    # Not every callable has __name__ (e.g. functools.partial objects).
    label = getattr(func, '__name__', repr(func))
    print('elapsed time for {} runs of {} is {} seconds'.format(times, label, elapsed))


get_running_time(get_char_prob, '神', 10000)

get_running_time(get_char_prob_slow, '神', 10000)
from functools import reduce
from operator import mul


def prob_of_string(string):
    """Unigram probability of ``string``.

    Multiplies ``get_char_prob`` over every character, treating characters as
    independent.

    Args:
        string: the text to score, one token per character.

    Returns:
        The product of the per-character probabilities; 1.0 for an empty
        string (the empty product) instead of raising TypeError.
    """
    # The explicit 1.0 start value makes reduce well-defined on empty input
    # and leaves non-empty results unchanged (1.0 * p == p).
    return reduce(mul, (get_char_prob(char) for char in string), 1.0)


prob_of_string('这是一个比较常见测试用例')

prob_of_string('这是一个比较罕见测试用例')

print(prob_of_string('广州有一个地方叫做沥窖'))
print(prob_of_string('杭州有一个地方叫做西湖'))

# Sentence pairs used to sanity-check a language model: each pair holds two
# variants of a sentence to compare.
pair = """前天晚上吃晚饭的时候
前天晚上吃早饭的时候""".split('\n')

pair2 = """正是一个好看的小猫
真是一个好看的小猫""".split('\n')

pair3 = """我无言以对,简直
我简直无言以对""".split('\n')

pairs = [pair, pair2, pair3]


def get_probability_prefromance(language_model_func, pairs):
    """Print each sentence of every pair with the probability the model gives it.

    Args:
        language_model_func: callable mapping a tokenized string to a
            probability.
        pairs: iterable of two-element sequences ``(p1, p2)`` of sentences.
    """
    for (p1, p2) in pairs:
        print('*'*18)
        for sentence in (p1, p2):
            # tokenize strips punctuation such as ',' before scoring.
            print('\t\t {} with probability {}'.format(sentence, language_model_func(tokenize(sentence))))


# Correctly spelled alias; the original (misspelled) name is kept so existing
# cells keep working.
get_probability_performance = get_probability_prefromance

get_probability_prefromance(prob_of_string, pairs)  # none of the three results is reasonable

# ## 2-Gram
#
# $$ Pr(w_0 w_1 w_2 ... w_n) = Pr(w_0) \cdot Pr(w_1 | w_0) \cdot Pr(w_2 | w_1) ...
#     \cdot Pr(w_n | w_{n-1}) $$
#
# $$ Pr(w_1 | w_0) = \frac{Pr(w_0 w_1)}{Pr(w_0)} $$

# Count every overlapping character n-gram (n = gram_length) in the corpus.
gram_length = 2
two_gram_counts = Counter(
    ALL_CHARACTER[i:i + gram_length]
    # "+ 1" so the final n-gram (starting at len - gram_length) is included;
    # the range is empty when the corpus is shorter than gram_length.
    for i in range(len(ALL_CHARACTER) - gram_length + 1)
)
17810),\n", + " ('项目', 17809),\n", + " ('我们', 17778),\n", + " ('一路', 17295),\n", + " ('一带', 16930),\n", + " ('社会', 16759),\n", + " ('日电', 16731),\n", + " ('带一', 16683),\n", + " ('技术', 16546),\n", + " ('问题', 16448),\n", + " ('文化', 16386),\n", + " ('通过', 16301),\n", + " ('活动', 16137),\n", + " ('决赛', 16108),\n", + " ('00', 15900),\n", + " ('研究', 15864),\n", + " ('12', 15836),\n", + " ('人民', 15693),\n", + " ('11', 15662),\n", + " ('01', 15505),\n", + " ('15', 15328),\n", + " ('目前', 15296),\n", + " ('5日', 14960),\n", + " ('n1', 14932),\n", + " ('3日', 14872),\n", + " ('全国', 14819),\n", + " ('人员', 14664),\n", + " ('投资', 14531),\n", + " ('2日', 14530),\n", + " ('重要', 14453),\n", + " ('中心', 14365),\n", + " ('产业', 14123),\n", + " ('19', 14112),\n", + " ('今年', 13990),\n", + " ('行的', 13810),\n", + " ('地区', 13807),\n", + " ('法国', 13792),\n", + " ('4日', 13749),\n", + " ('的一', 13732),\n", + " ('城市', 13482),\n", + " ('晋级', 13433),\n", + " ('大学', 13406),\n", + " ('成为', 13392),\n", + " ('9日', 13371),\n", + " ('0日', 13222),\n", + " ('安全', 13176),\n", + " ('6日', 13113),\n", + " ('球员', 13102),\n", + " ('30', 12922),\n", + " ('7日', 12919),\n", + " ('8日', 12891),\n", + " ('没有', 12685),\n", + " ('创新', 12627),\n", + " ('管理', 12621),\n", + " ('自己', 12592),\n", + " ('n2', 12575),\n", + " ('组织', 12481),\n", + " ('利亚', 12406),\n", + " ('13', 12396),\n", + " ('他们', 12384),\n", + " ('方面', 12243),\n", + " ('第一', 12231),\n", + " ('时间', 12157),\n", + " ('14', 11928),\n", + " ('可以', 11898),\n", + " ('月3', 11878),\n", + " ('电记', 11810),\n", + " ('这是', 11801),\n", + " ('新闻', 11645),\n", + " ('摄n', 11579),\n", + " ('18', 11553),\n", + " ('10', 11467),\n", + " ('数据', 11404),\n", + " ('上海', 11281),\n", + " ('媒体', 11080),\n", + " ('提供', 11046),\n", + " ('战胜', 11014),\n", + " ('罗斯', 11002),\n", + " ('产品', 10929),\n", + " ('科技', 10926),\n", + " ('学生', 10902),\n", + " ('已经', 10854),\n", + " ('发布', 10817),\n", + " ('总统', 10788),\n", + " ('同时', 10754),\n", + " ('相关', 10727),\n", + " ('认为', 10627),\n", + " 
('实现', 10625),\n", + " ('全球', 10570),\n", + " ('队球', 10498),\n", + " ('俄罗', 10380),\n", + " ('信息', 10324),\n", + " ('部门', 10301),\n", + " ('政策', 10252),\n", + " ('香港', 10161),\n", + " ('环境', 10133),\n", + " ('联合', 10123),\n", + " ('改革', 10095),\n", + " ('线足', 10073),\n", + " ('开始', 10069),\n", + " ('英国', 10051),\n", + " ('增长', 10003),\n", + " ('n国', 9939),\n", + " ('公开', 9910),\n", + " ('6年', 9898),\n", + " ('网球', 9876),\n", + " ('可能', 9800),\n", + " ('赛季', 9782),\n", + " ('平台', 9763),\n", + " ('报道', 9727),\n", + " ('作为', 9679),\n", + " ('在比', 9666),\n", + " ('赛n', 9574),\n", + " ('其中', 9548),\n", + " ('银行', 9538),\n", + " ('金融', 9523),\n", + " ('机构', 9346),\n", + " ('参加', 9336),\n", + " ('23', 9327),\n", + " ('支持', 9327),\n", + " ('大利', 9315),\n", + " ('25', 9312),\n", + " ('完新', 9295),\n", + " ('冠军', 9270),\n", + " ('关系', 9260),\n", + " ('教育', 9228),\n", + " ('生活', 9207),\n", + " ('21', 9187),\n", + " ('第二', 9158),\n", + " ('以及', 9147),\n", + " ('是一', 9139),\n", + " ('在2', 9127),\n", + " ('德国', 8979),\n", + " ('旅游', 8965),\n", + " ('获得', 8956),\n", + " ('时装', 8948),\n", + " ('亿元', 8923),\n", + " ('情况', 8913),\n", + " ('开展', 8895),\n", + " ('主要', 8871),\n", + " ('50', 8844),\n", + " ('当地', 8839),\n", + " ('发生', 8802),\n", + " ('日本', 8800),\n", + " ('推进', 8791),\n", + " ('影响', 8739),\n", + " ('生产', 8724),\n", + " ('部分', 8710),\n", + " ('22', 8700),\n", + " ('设计', 8681),\n", + " ('需要', 8623),\n", + " ('一步', 8608),\n", + " ('这一', 8586),\n", + " ('系统', 8582),\n", + " ('n这', 8511),\n", + " ('要求', 8389),\n", + " ('一些', 8372),\n", + " ('我国', 8357),\n", + " ('了一', 8259),\n", + " ('介绍', 8252),\n", + " ('显示', 8227),\n", + " ('进入', 8197),\n", + " ('领域', 8084),\n", + " ('美元', 8060),\n", + " ('新n', 8018),\n", + " ('推动', 8002),\n", + " ('以来', 7949),\n", + " ('电影', 7928),\n", + " ('包括', 7918),\n", + " ('5月', 7916),\n", + " ('这些', 7915),\n", + " ('国内', 7913),\n", + " ('发现', 7890),\n", + " ('62', 7886),\n", + " ('社发', 7857),\n", + " ('4月', 7783),\n", + " ('这个', 7775),\n", + " 
('交易', 7765),\n", + " ('保护', 7757),\n", + " ('共同', 7754),\n", + " ('的2', 7753),\n", + " ('使用', 7738),\n", + " ('计划', 7733),\n", + " ('代表', 7703),\n", + " ('一名', 7685),\n", + " ('工程', 7652),\n", + " ('传统', 7639),\n", + " ('之一', 7633),\n", + " ('未来', 7593),\n", + " ('基础', 7591),\n", + " ('年来', 7583),\n", + " ('参与', 7575),\n", + " ('实施', 7568),\n", + " ('n在', 7524),\n", + " ('就是', 7469),\n", + " ('调查', 7446),\n", + " ('24', 7445),\n", + " ('孩子', 7397),\n", + " ('开赛', 7395),\n", + " ('行业', 7341),\n", + " ('上的', 7320),\n", + " ('集团', 7298),\n", + " ('来自', 7296),\n", + " ('资金', 7289),\n", + " ('历史', 7278),\n", + " ('进一', 7264),\n", + " ('年的', 7253),\n", + " ('中央', 7244),\n", + " ('加强', 7242),\n", + " ('科学', 7239),\n", + " ('16', 7193),\n", + " ('28', 7173),\n", + " ('大的', 7171),\n", + " ('队n', 7145),\n", + " ('7赛', 7121),\n", + " ('31', 7115),\n", + " ('了解', 7101),\n", + " ('现在', 7087),\n", + " ('希望', 7085),\n", + " ('出现', 7070),\n", + " ('17', 7024),\n", + " ('方式', 7022),\n", + " ('资源', 7018),\n", + " ('的重', 7007),\n", + " ('26', 6926),\n", + " ('正在', 6906),\n", + " ('负责', 6840),\n", + " ('价格', 6823),\n", + " ('第三', 6804),\n", + " ('运动', 6767),\n", + " ('的中', 6759),\n", + " ('韩国', 6736),\n", + " ('举办', 6733),\n", + " ('个人', 6730),\n", + " ('不断', 6725),\n", + " ('国的', 6721),\n", + " ('n3', 6719),\n", + " ('积极', 6711),\n", + " ('27', 6689),\n", + " ('的人', 6671),\n", + " ('完成', 6661),\n", + " ('巴黎', 6633),\n", + " ('网络', 6633),\n", + " ('持续', 6613),\n", + " ('n6', 6560),\n", + " ('路透', 6533),\n", + " ('赛第', 6484),\n", + " ('年6', 6481),\n", + " ('中的', 6457),\n", + " ('超过', 6453),\n", + " ('月1', 6440),\n", + " ('0多', 6432),\n", + " ('继续', 6429),\n", + " ('也是', 6373),\n", + " ('的是', 6371),\n", + " ('特朗', 6349),\n", + " ('法新', 6339),\n", + " ('交流', 6323),\n", + " ('更多', 6319),\n", + " ('去年', 6315),\n", + " ('朗普', 6315),\n", + " ('月2', 6302),\n", + " ('学院', 6274),\n", + " ('学校', 6268),\n", + " ('会议', 6263),\n", + " ('12', 6255),\n", + " ('庆祝', 6241),\n", + " ('能力', 6236),\n", + 
" ('其他', 6235),\n", + " ('斯坦', 6230),\n", + " ('来的', 6216),\n", + " ('根据', 6200),\n", + " ('万元', 6172),\n", + " ('人的', 6161),\n", + " ('欧新', 6159),\n", + " ('标准', 6157),\n", + " ('艺术', 6156),\n", + " ('欧洲', 6152),\n", + " ('到了', 6148),\n", + " ('贸易', 6135),\n", + " ('社欧', 6129),\n", + " ('全面', 6129),\n", + " ('制造', 6122),\n", + " ('提高', 6118),\n", + " ('提升', 6100),\n", + " ('70', 6084),\n", + " ('主席', 6077),\n", + " ('30', 6065),\n", + " ('一次', 6046),\n", + " ('委员', 6033),\n", + " ('汽车', 6018),\n", + " ('3月', 6001),\n", + " ('11', 5998),\n", + " ('对于', 5989),\n", + " ('如果', 5989),\n", + " ('n一', 5965),\n", + " ('生态', 5962),\n", + " ('现场', 5961),\n", + " ('开发', 5960),\n", + " ('15', 5946),\n", + " ('主场', 5943),\n", + " ('n据', 5941),\n", + " ('有关', 5940),\n", + " ('的新', 5928),\n", + " ('战略', 5892),\n", + " ('专家', 5878),\n", + " ('首都', 5850),\n", + " ('有限', 5842),\n", + " ('29', 5842),\n", + " ('业的', 5831),\n", + " ('江苏', 5821),\n", + " ('因为', 5812),\n", + " ('台湾', 5780),\n", + " ('场以', 5773),\n", + " ('专业', 5770),\n", + " ('最大', 5768),\n", + " ('40', 5749),\n", + " ('河北', 5745),\n", + " ('成功', 5721),\n", + " ('n中', 5719),\n", + " ('开放', 5702),\n", + " ('19', 5692),\n", + " ('增加', 5690),\n", + " ('展示', 5671),\n", + " ('重点', 5668),\n", + " ('队主', 5662),\n", + " ('建立', 5654),\n", + " ('人们', 5649),\n", + " ('智能', 5648),\n", + " ('提出', 5647),\n", + " ('国人', 5594),\n", + " ('规模', 5591),\n", + " ('比0', 5577),\n", + " ('交通', 5576),\n", + " ('群众', 5575),\n", + " ('民币', 5575),\n", + " ('正式', 5573),\n", + " ('社法', 5557),\n", + " ('报告', 5554),\n", + " ('BA', 5552),\n", + " ('工业', 5552),\n", + " ('的时', 5529),\n", + " ('行为', 5516),\n", + " ('社北', 5509),\n", + " ('新的', 5500),\n", + " ('领导', 5477),\n", + " ('0万', 5471),\n", + " ('健康', 5468),\n", + " ('消费', 5467),\n", + " ('品牌', 5461),\n", + " ('的大', 5449),\n", + " ('会上', 5442),\n", + " ('水平', 5432),\n", + " ('行动', 5429),\n", + " ('促进', 5413),\n", + " ('5年', 5408),\n", + " ('造成', 5399),\n", + " ('接受', 5377),\n", + " ('论坛', 5374),\n", 
+ " ('澳大', 5371),\n", + " ('基金', 5371),\n", + " ('事件', 5366),\n", + " ('不同', 5361),\n", + " ('农业', 5347),\n", + " ('很多', 5336),\n", + " ('学习', 5313),\n", + " ('0年', 5311),\n", + " ('两国', 5305),\n", + " ('国选', 5304),\n", + " ('互联', 5302),\n", + " ('们的', 5301),\n", + " ('利用', 5295),\n", + " ('线网', 5292),\n", + " ('游客', 5291),\n", + " ('尼亚', 5290),\n", + " ('风险', 5281),\n", + " ('公里', 5281),\n", + " ('说n', 5269),\n", + " ('铁路', 5252),\n", + " ('轮比', 5245),\n", + " ('系列', 5244),\n", + " ('业务', 5239),\n", + " ('规定', 5234),\n", + " ('在一', 5231),\n", + " ('这样', 5229),\n", + " ('广州', 5224),\n", + " ('责任', 5208),\n", + " ('的发', 5189),\n", + " ('区的', 5187),\n", + " ('广东', 5182),\n", + " ('解决', 5182),\n", + " ('1外', 5173),\n", + " ('政治', 5158),\n", + " ('篮球', 5154),\n", + " ('当天', 5141),\n", + " ('双方', 5117),\n", + " ('监管', 5116),\n", + " ('关注', 5100),\n", + " ('比1', 5094),\n", + " ('社路', 5093),\n", + " ('男子', 5085),\n", + " ('带来', 5075),\n", + " ('戛纳', 5065),\n", + " ('作品', 5058),\n", + " ('习近', 5055),\n", + " ('近平', 5054),\n", + " ('有一', 5049),\n", + " ('地方', 5043),\n", + " ('随着', 5039),\n", + " ('综合', 5030),\n", + " ('医院', 5022),\n", + " ('标题', 5007),\n", + " ('2外', 4981),\n", + " ('60', 4960),\n", + " ('医疗', 4955),\n", + " ('用户', 4954),\n", + " ('内容', 4952),\n", + " ('个月', 4939),\n", + " ('不仅', 4937),\n", + " ('行n', 4933),\n", + " ('开幕', 4928),\n", + " ('设施', 4919),\n", + " ('月5', 4917),\n", + " ('形成', 4912),\n", + " ('规划', 4909),\n", + " ('区域', 4909),\n", + " ('己的', 4904),\n", + " ('首次', 4904),\n", + " ('分别', 4901),\n", + " ('拍摄', 4900),\n", + " ('非常', 4885),\n", + " ('精神', 4877),\n", + " ('达到', 4860),\n", + " ('人士', 4857),\n", + " ('贫困', 4851),\n", + " ('位于', 4850),\n", + " ('存在', 4846),\n", + " ('由于', 4831),\n", + " ('需求', 4820),\n", + " ('分析', 4803),\n", + " ('2比', 4801),\n", + " ('期间', 4780),\n", + " ('机制', 4775),\n", + " ('过程', 4775),\n", + " ('1日', 4770),\n", + " ('以上', 4748),\n", + " ('制度', 4739),\n", + " ('儿童', 4732),\n", + " ('在法', 4724),\n", + " ('n图', 
4715),\n", + " ('取得', 4713),\n", + " ('球n', 4711),\n", + " ('基本', 4704),\n", + " ('新疆', 4703),\n", + " ('不是', 4700),\n", + " ('塞尔', 4681),\n", + " ('帮助', 4673),\n", + " ('3外', 4670),\n", + " ('特别', 4664),\n", + " ('完n', 4661),\n", + " ('创业', 4646),\n", + " ('什么', 4641),\n", + " ('展的', 4641),\n", + " ('在这', 4639),\n", + " ('国国', 4636),\n", + " ('一起', 4635),\n", + " ('限公', 4634),\n", + " ('决定', 4631),\n", + " ('的第', 4624),\n", + " ('山东', 4602),\n", + " ('保障', 4602),\n", + " ('目标', 4580),\n", + " ('天津', 4579),\n", + " ('最高', 4576),\n", + " ('联网', 4574),\n", + " ('指出', 4570),\n", + " ('基地', 4568),\n", + " ('宣布', 4567),\n", + " ('员会', 4552),\n", + " ('NB', 4552),\n", + " ('年1', 4546),\n", + " ('青年', 4544),\n", + " ('为了', 4542),\n", + " ('里的', 4542),\n", + " ('还是', 4541),\n", + " ('强n', 4538),\n", + " ('训练', 4538),\n", + " ('意大', 4527),\n", + " ('处理', 4518),\n", + " ('单位', 4512),\n", + " ('作用', 4510),\n", + " ('人才', 4509),\n", + " ('此次', 4502),\n", + " ('收入', 4499),\n", + " ('优势', 4495),\n", + " ('的国', 4493),\n", + " ('好的', 4474),\n", + " ('月4', 4452),\n", + " ('空间', 4442),\n", + " ('打造', 4441),\n", + " ('书记', 4433),\n", + " ('浙江', 4430),\n", + " ('一直', 4428),\n", + " ('所有', 4421),\n", + " ('消息', 4419),\n", + " ('最终', 4411),\n", + " ('装周', 4408),\n", + " ('关键', 4397),\n", + " ('模式', 4390),\n", + " ('0战', 4387),\n", + " ('体系', 4364),\n", + " ('80', 4353),\n", + " ('作的', 4349),\n", + " ('还有', 4346),\n", + " ('成立', 4345),\n", + " ('如何', 4345),\n", + " ('经营', 4337),\n", + " ('这种', 4332),\n", + " ('50', 4331),\n", + " ('以2', 4328),\n", + " ('印度', 4328),\n", + " ('电视', 4313),\n", + " ('4外', 4301),\n", + " ('出了', 4295),\n", + " ('销售', 4295),\n", + " ('图表', 4285),\n", + " ('之后', 4283),\n", + " ('近年', 4275),\n", + " ('在中', 4271),\n", + " ('小时', 4266),\n", + " ('京2', 4259),\n", + " ('吸引', 4257),\n", + " ('标赛', 4253),\n", + " ('在北', 4247),\n", + " ('动物', 4231),\n", + " ('女子', 4226),\n", + " ('任务', 4218),\n", + " ('锦标', 4215),\n", + " ('6年', 4210),\n", + " ('具有', 4208),\n", + " 
('西班', 4205),\n", + " ('月6', 4203),\n", + " ('武汉', 4198),\n", + " ('班牙', 4195),\n", + " ('重大', 4194),\n", + " ('之路', 4185),\n", + " ('协议', 4183),\n", + " ('启动', 4181),\n", + " ('多的', 4168),\n", + " ('按照', 4155),\n", + " ('02', 4155),\n", + " ('影片', 4150),\n", + " ('有效', 4150),\n", + " ('13', 4147),\n", + " ('样的', 4146),\n", + " ('袭击', 4140),\n", + " ('超联', 4139),\n", + " ('一年', 4135),\n", + " ('纽约', 4133),\n", + " ('受到', 4127),\n", + " ('航空', 4126),\n", + " ('英超', 4126),\n", + " ('n小', 4123),\n", + " ('死亡', 4113),\n", + " ('能够', 4111),\n", + " ('指数', 4105),\n", + " ('关于', 4103),\n", + " ('选择', 4102),\n", + " ('不少', 4101),\n", + " ('出的', 4101),\n", + " ('不能', 4094),\n", + " ('得到', 4085),\n", + " ('德里', 4078),\n", + " ('军n', 4073),\n", + " ('文明', 4073),\n", + " ('共享', 4065),\n", + " ('月9', 4056),\n", + " ('扶贫', 4051),\n", + " ('近日', 4048),\n", + " ('国队', 4042),\n", + " ('最后', 4040),\n", + " ('都是', 4037),\n", + " ('美联', 4029),\n", + " ('能源', 4029),\n", + " ('成果', 4023),\n", + " ('表演', 4022),\n", + " ('14', 4021),\n", + " ('稳定', 4016),\n", + " ('选举', 4011),\n", + " ('斯特', 4003),\n", + " ('主题', 4002),\n", + " ('伊斯', 3995),\n", + " ('合国', 3989),\n", + " ('的主', 3989),\n", + " ('博物', 3982),\n", + " ('球法', 3982),\n", + " ('措施', 3970),\n", + " ('动n', 3967),\n", + " ('统计', 3963),\n", + " ('坚持', 3962),\n", + " ('一家', 3959),\n", + " ('看到', 3957),\n", + " ('夺冠', 3950),\n", + " ('主任', 3950),\n", + " ('预计', 3950),\n", + " ('教练', 3946),\n", + " ('将于', 3940),\n", + " ('在美', 3940),\n", + " ('下一', 3928),\n", + " ('岁的', 3928),\n", + " ('日新', 3927),\n", + " ('家庭', 3919),\n", + " ('成了', 3919),\n", + " ('上涨', 3910),\n", + " ('叙利', 3902),\n", + " ('乒乓', 3901),\n", + " ('摄的', 3897),\n", + " ('将在', 3890),\n", + " ('同比', 3888),\n", + " ('日报', 3874),\n", + " ('乓球', 3873),\n", + " ('的生', 3873),\n", + " ('落实', 3869),\n", + " ('阶段', 3864),\n", + " ('结果', 3859),\n", + " ('5外', 3856),\n", + " ('努力', 3855),\n", + " ('警方', 3853),\n", + " ('条件', 3853),\n", + " ('多个', 3838),\n", + " ('此外', 3827),\n", + 
" ('直接', 3826),\n", + " ('的工', 3826),\n", + " ('球英', 3826),\n", + " ('团队', 3822),\n", + " ('法律', 3820),\n", + " ('分钟', 3818),\n", + " ('是在', 3814),\n", + " ('全部', 3813),\n", + " ('变化', 3812),\n", + " ('建筑', 3808),\n", + " ('结构', 3804),\n", + " ('18', 3804),\n", + " ('公布', 3804),\n", + " ('责人', 3803),\n", + " ('编辑', 3801),\n", + " ('不过', 3800),\n", + " ('化的', 3795),\n", + " ('告诉', 3787),\n", + " ('虽然', 3780),\n", + " ('两个', 3778),\n", + " ('月7', 3777),\n", + " ('重庆', 3775),\n", + " ('5日', 3773),\n", + " ('时候', 3767),\n", + " ('7年', 3762),\n", + " ('发挥', 3762),\n", + " ('小标', 3762),\n", + " ('方案', 3761),\n", + " ('保持', 3758),\n", + " ('严重', 3756),\n", + " ('的成', 3755),\n", + " ('出席', 3755),\n", + " ('物馆', 3740),\n", + " ('河南', 3739),\n", + " ('播发', 3735),\n", + " ('经过', 3734),\n", + " ('执行', 3733),\n", + " ('目的', 3730),\n", + " ('毕业', 3719),\n", + " ('朝鲜', 3717),\n", + " ('更加', 3709),\n", + " ('中n', 3700),\n", + " ('日中', 3695),\n", + " ('结束', 3695),\n", + " ('这里', 3693),\n", + " ('核心', 3687),\n", + " ('伦敦', 3678),\n", + " ('的地', 3673),\n", + " ('明显', 3673),\n", + " ('成绩', 3672),\n", + " ('是中', 3672),\n", + " ('球队', 3663),\n", + " ('国务', 3660),\n", + " ('质量', 3659),\n", + " ('锦赛', 3657),\n", + " ('以1', 3654),\n", + " ('为中', 3652),\n", + " ('有的', 3647),\n", + " ('25', 3646),\n", + " ('各地', 3645),\n", + " ('调整', 3644),\n", + " ('万人', 3643),\n", + " ('他的', 3642),\n", + " ('倡议', 3639),\n", + " ('斯科', 3638),\n", + " ('地产', 3637),\n", + " ('法院', 3633),\n", + " ('村民', 3632),\n", + " ('针对', 3632),\n", + " ('行了', 3611),\n", + " ('环保', 3608),\n", + " ('行政', 3608),\n", + " ('之间', 3602),\n", + " ('意见', 3597),\n", + " ('半决', 3592),\n", + " ('卫星', 3590),\n", + " ('新新', 3579),\n", + " ('大会', 3579),\n", + " ('展n', 3577),\n", + " ('内的', 3577),\n", + " ('1战', 3569),\n", + " ('后的', 3563),\n", + " ('生的', 3562),\n", + " ('影节', 3562),\n", + " ('现了', 3557),\n", + " ('家的', 3554),\n", + " ('面积', 3553),\n", + " ('特色', 3551),\n", + " ('高峰', 3550),\n", + " ('小学', 3547),\n", + " ('成本', 3543),\n", 
+ " ('来越', 3543),\n", + " ('自然', 3540),\n", + " ('一种', 3539),\n", + " ('成员', 3537),\n", + " ('n2', 3532),\n", + " ('长期', 3531),\n", + " ('干部', 3531),\n", + " ('秀n', 3531),\n", + " ('时代', 3530),\n", + " ('脱贫', 3526),\n", + " ('仪式', 3524),\n", + " ('协会', 3516),\n", + " ('日摄', 3509),\n", + " ('过去', 3499),\n", + " ('价值', 3491),\n", + " ('导致', 3487),\n", + " ('一场', 3485),\n", + " ('居民', 3485),\n", + " ('截至', 3481),\n", + " ('音乐', 3481),\n", + " ('马德', 3475),\n", + " ('运会', 3473),\n", + " ('的情', 3471),\n", + " ('要的', 3469),\n", + " ('机会', 3469),\n", + " ('知识', 3465),\n", + " ('越来', 3463),\n", + " ('主义', 3459),\n", + " ('3日', 3456),\n", + " ('布会', 3455),\n", + " ('欧盟', 3450),\n", + " ('公安', 3449),\n", + " ('卫生', 3441),\n", + " ('职业', 3435),\n", + " ('外交', 3429),\n", + " ('准备', 3424),\n", + " ('场的', 3422),\n", + " ('范围', 3422),\n", + " ('6外', 3422),\n", + " ('不会', 3420),\n", + " ('但是', 3418),\n", + " ('南省', 3407),\n", + " ('甚至', 3404),\n", + " ('一定', 3402),\n", + " ('因此', 3400),\n", + " ('湖北', 3398),\n", + " ('民族', 3393),\n", + " ('升级', 3381),\n", + " ('级n', 3379),\n", + " ('节n', 3378),\n", + " ('应用', 3378),\n", + " ('控制', 3377),\n", + " ('甲联', 3372),\n", + " ('垃圾', 3372),\n", + " ('为主', 3369),\n", + " ('法网', 3367),\n", + " ('6日', 3365),\n", + " ('3比', 3362),\n", + " ('国总', 3362),\n", + " ('季度', 3356),\n", + " ('原因', 3356),\n", + " ('实际', 3356),\n", + " ('明确', 3355),\n", + " ('商品', 3353),\n", + " ('马拉', 3351),\n", + " ('公园', 3349),\n", + " ('每年', 3349),\n", + " ('网站', 3347),\n", + " ('克斯', 3346),\n", + " ('的最', 3338),\n", + " ('自由', 3335),\n", + " ('冠n', 3334),\n", + " ('2日', 3333),\n", + " ('犯罪', 3331),\n", + " ('市民', 3330),\n", + " ('部长', 3330),\n", + " ('巴西', 3329),\n", + " ('者李', 3327),\n", + " ('完善', 3326),\n", + " ('手机', 3325),\n", + " ('赛男', 3324),\n", + " ('羽毛', 3323),\n", + " ('和平', 3321),\n", + " ('日前', 3313),\n", + " ('人在', 3312),\n", + " ('培训', 3303),\n", + " ('毛球', 3291),\n", + " ('无人', 3288),\n", + " ('必须', 3287),\n", + " ('月8', 3284),\n", + " ('中华', 
3281),\n", + " ('机器', 3279),\n", + " ('就业', 3279),\n", + " ('设备', 3276),\n", + " ('如今', 3265),\n", + " ('育1', 3257),\n", + " ('西亚', 3254),\n", + " ('体验', 3253),\n", + " ('的高', 3250),\n", + " ('布的', 3249),\n", + " ('加快', 3248),\n", + " ('0日', 3246),\n", + " ('多年', 3246),\n", + " ('并不', 3240),\n", + " ('附近', 3238),\n", + " ('功能', 3237),\n", + " ('21', 3236),\n", + " ('公共', 3230),\n", + " ('州市', 3226),\n", + " ('航天', 3221),\n", + " ('分之', 3218),\n", + " ('运营', 3210),\n", + " ('和国', 3210),\n", + " ('资产', 3207),\n", + " ('幕n', 3203),\n", + " ('4日', 3202),\n", + " ('地的', 3201),\n", + " ('挑战', 3184),\n", + " ('土耳', 3179),\n", + " ('耳其', 3179),\n", + " ('飞机', 3178),\n", + " ('35', 3176),\n", + " ('集中', 3175),\n", + " ('社区', 3174),\n", + " ('欧冠', 3169),\n", + " ('定的', 3165),\n", + " ('农民', 3164),\n", + " ('亚洲', 3163),\n", + " ('们在', 3162),\n", + " ('式n', 3161),\n", + " ('05', 3160),\n", + " ('图片', 3158),\n", + " ('现代', 3155),\n", + " ('程中', 3152),\n", + " ('4年', 3152),\n", + " ('新区', 3151),\n", + " ('突破', 3149),\n", + " ('0多', 3148),\n", + " ('n社', 3144),\n", + " ('亿美', 3140),\n", + " ('除了', 3127),\n", + " ('1月', 3126),\n", + " ('只有', 3126),\n", + " ('中中', 3124),\n", + " ('者王', 3117),\n", + " ('会在', 3116),\n", + " ('莫斯', 3116),\n", + " ('为一', 3114),\n", + " ('竞争', 3113),\n", + " ('贵州', 3107),\n", + " ('产生', 3104),\n", + " ('总理', 3102),\n", + " ('深入', 3101),\n", + " ('深圳', 3101),\n", + " ('委会', 3101),\n", + " ('发表', 3097),\n", + " ('案件', 3097),\n", + " ('声明', 3095),\n", + " ('商业', 3094),\n", + " ('中超', 3090),\n", + " ('大家', 3087),\n", + " ('力量', 3080),\n", + " ('沿线', 3079),\n", + " ('左右', 3077),\n", + " ('99', 3074),\n", + " ('平方', 3073),\n", + " ('7日', 3072),\n", + " ('摄影', 3067),\n", + " ('年前', 3066),\n", + " ('的比', 3063),\n", + " ('人n', 3060),\n", + " ('保险', 3051),\n", + " ('两岸', 3050),\n", + " ('第十', 3044),\n", + " ('以3', 3043),\n", + " ('球欧', 3040),\n", + " ('展开', 3038),\n", + " ('第3', 3032),\n", + " ('采访', 3032),\n", + " ('办公', 3031),\n", + " ('36', 3029),\n", + " 
# Probability lookup for two-character strings, built from the notebook-level
# bigram counter.  NOTE(review): get_probability_from_counts is defined earlier
# in the notebook (not visible here); presumably it maps a string to its
# relative frequency in the given counter -- confirm.
get_pair_prob = get_probability_from_counts(two_gram_counts)


def get_2_gram_prob(word, prev):
    """Estimate the probability of seeing ``word`` given the preceding ``prev``.

    Args:
        word: the current character.
        prev: the character before it ('' for the first character of a string).

    Returns:
        ``get_pair_prob(prev + word) / get_char_prob(prev)`` when the pair has a
        positive probability, otherwise the unigram value ``get_char_prob(word)``.
    """
    pair_prob = get_pair_prob(prev + word)  # look the pair up once, not twice
    if pair_prob > 0:
        return pair_prob / get_char_prob(prev)
    return get_char_prob(word)


def get_2_gram_string_prob(string):
    """Return the product of ``get_2_gram_prob`` over every character of ``string``.

    The first character is scored against an empty predecessor ''.  An empty
    string yields 1 (the empty product) instead of the TypeError that
    ``reduce`` raises when it is given no items and no initial value.
    """
    char_probs = (
        get_2_gram_prob(char, string[i - 1] if i else '')
        for i, char in enumerate(string)
    )
    return reduce(mul, char_probs, 1)
1.0220793879946632e-25\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 3.7425390630342124e-22\n", + "\t\t 我简直无言以对 with probability 3.742539063034212e-22\n" + ] + } + ], + "source": [ + "get_probability_prefromance(prob_of_string, pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 6.847690505341362e-20\n", + "\t\t 前天晚上吃早饭的时候 with probability 1.7483929208056836e-19\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 1.313877986865493e-16\n", + "\t\t 真是一个好看的小猫 with probability 8.984863857283642e-17\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 2.143887688284483e-17\n", + "\t\t 我简直无言以对 with probability 2.0730979185519055e-17\n" + ] + } + ], + "source": [ + "get_probability_prefromance(get_2_gram_string_prob, pairs)" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb new file mode 100644 index 0000000..9a7478c --- /dev/null +++ b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter\n", + "from collections import defaultdict\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# wikipedia, smoothing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Processing Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 运行 python WikiExtractor.py -b 2000M zhwiki-20181101-pages-articles.xml.bz2\n", + "# 没有做繁体转简体处理,不会安装opencc的windows版本 :-(" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "MemoryError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mall_wiki_content\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'D://pyproject//git//AI-NLP//data//text//AA//wiki_00'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'UTF-8'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mall_wiki_content\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msub\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'<[^>]+>'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mall_wiki_content\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 去掉 tag\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[1;31m# decode input (taking the buffer into account)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[0mdata\u001b[0m 
def tokenize(string):
    r"""Return ``string`` with everything except word characters removed.

    All runs matched by ``\w+`` (letters, digits and underscore; Unicode-aware
    for ``str`` patterns, so CJK characters are kept) are concatenated in
    order.  Punctuation and whitespace are dropped.

    Args:
        string: text to clean.

    Returns:
        A single string made of the word characters of ``string``.
    """
    # A raw string is required: '\w' in a normal literal is an invalid escape
    # sequence and triggers a DeprecationWarning / SyntaxWarning.
    # About the earlier alternative '[\w|\d]+': inside a character class '|' is
    # a literal pipe, not alternation, so that pattern additionally keeps '|'
    # characters.  '\d' adds nothing because '\w' already matches digits.
    return ''.join(re.findall(r'\w+', string))
gram_length = 2

# Count every window of `gram_length` consecutive characters.
# A text of length L has L - gram_length + 1 such windows; the previous
# range(L - gram_length) stopped one short and never counted the final bigram.
# Feeding Counter a generator keeps memory flat (no intermediate list of
# ~len(all_character) slices), which was the reason for the original for-loop.
two_gram_counts = Counter(
    all_character[start:start + gram_length]
    for start in range(len(all_character) - gram_length + 1)
)

# Show the 20 most frequent bigrams.
two_gram_counts.most_common(20)
def get_char_prob_from_counts(counts, k=5, vocab_size=90000):
    """Build a Good-Turing smoothed unigram probability function.

    Counts ``r <= k`` are replaced by the Good-Turing adjusted count
    ``r* = (r + 1) * N[r+1] / N[r]`` where ``N[r]`` is the number of distinct
    characters seen exactly ``r`` times.  The count mass removed that way is
    shared equally among the characters that were never seen.  Counts above
    ``k`` are used unchanged.

    Args:
        counts: mapping character -> number of occurrences (e.g. a Counter).
        k: largest count that is still smoothed.
        vocab_size: assumed total number of distinct characters, used to derive
            how many characters are unseen.  The default 90000 is the
            notebook's rough guess for the number of Chinese characters.

    Returns:
        A function ``get_prob(char) -> float``.

    Raises:
        ValueError: if ``counts`` holds no observations at all.
    """
    total = sum(counts.values())
    if total == 0:
        raise ValueError('counts must contain at least one observation')

    # nr[r] = number of distinct characters observed exactly r times.
    # A Counter returns 0 for missing r without inserting a key.
    nr = Counter(counts.values())
    unseen_types = vocab_size - len(counts)

    rstar = [0.0] * (k + 1)
    total_decreased = 0.0
    for r in range(1, k + 1):
        if nr[r] and nr[r + 1]:
            rstar[r] = (r + 1) * nr[r + 1] / nr[r]
        else:
            # The Good-Turing estimate is undefined (N[r] == 0) or would
            # collapse to zero (N[r+1] == 0); keep the raw count instead of
            # dividing by zero or giving seen characters probability 0.
            rstar[r] = float(r)
        total_decreased += r * nr[r] - rstar[r] * nr[r]

    # Spread the removed mass over the unseen characters; if the vocabulary
    # guess leaves no unseen characters there is nobody to give it to.
    rstar[0] = total_decreased / unseen_types if unseen_types > 0 else 0.0

    def get_prob(char):
        occurrence = counts.get(char, 0)
        adjusted = rstar[occurrence] if occurrence <= k else occurrence
        return adjusted / total

    return get_prob
def get_2_gram_prob_from_counts(counts, k=5, unigram_prob=None):
    """Build a bigram probability function with Katz back-off smoothing.

    Bigram counts ``r <= k`` are discounted by the Katz ratio ``d_r``; counts
    above ``k`` use the plain relative frequency within their context.  The
    probability mass freed by discounting is redistributed, through a
    per-context back-off weight, over the unigram probabilities of the
    characters never seen after that context.

    Args:
        counts: mapping two-character string -> number of occurrences.
        k: largest count that is still discounted.
        unigram_prob: function ``char -> probability`` used for backing off.
            Defaults to the notebook-level ``get_char_prob``.

    Returns:
        A function ``get_prob(word, prev) -> float`` giving the probability of
        ``word`` following ``prev``.  With ``prev == ''`` (start of a string),
        or a ``prev`` that never starts any counted bigram, the plain unigram
        probability of ``word`` is returned.
    """
    if unigram_prob is None:
        unigram_prob = get_char_prob

    # nr[r] = number of distinct bigrams observed exactly r times.
    nr = Counter(counts.values())

    # Discount ratios d_r for 1 <= r <= k (index 0 is unused).  A ratio stays
    # at 1 (no discount) wherever the formula would divide by zero.
    discount = [1.0] * (k + 1)
    if nr[1]:
        tail = (k + 1) * nr[k + 1] / nr[1]
        if tail != 1:
            for r in range(1, k + 1):
                if nr[r]:
                    r_star = (r + 1) * nr[r + 1] / nr[r]
                    discount[r] = (r_star / r - tail) / (1 - tail)

    # Group the raw counts by context character: successors[prev][word] = count.
    successors = {}
    for pair, count in counts.items():
        successors.setdefault(pair[0], {})[pair[1]] = count

    cond_prob = {}  # cond_prob[prev][word]: smoothed P(word | prev) for seen pairs
    backoff = {}    # backoff[prev]: weight applied to the unigram model
    for prev, following in successors.items():
        context_total = sum(following.values())
        cond_prob[prev] = {
            word: (count if count > k else discount[count] * count) / context_total
            for word, count in following.items()
        }
        seen_mass = sum(cond_prob[prev].values())
        unigram_mass_left = 1 - sum(unigram_prob(word) for word in following)
        # If the seen successors already hold all unigram mass there is nothing
        # to back off to, so the weight is irrelevant; avoid dividing by zero.
        backoff[prev] = (1 - seen_mass) / unigram_mass_left if unigram_mass_left else 0.0

    def get_prob(word, prev):
        if counts.get(prev + word, 0) > 0:
            return cond_prob[prev][word]
        if prev == '' or prev not in backoff:
            # No usable context: start of string, or a context character that
            # never starts a counted bigram (previously a KeyError).
            return unigram_prob(word)
        return backoff[prev] * unigram_prob(word)

    return get_prob
并没能很好的区分这些语句。感觉问题主要是上下文的距离超过了2-gram的长度。分词后,再建立3-gram模型,或许能够区分开来。" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}