From 172650aa9c39d5c8e03663a6ea747bfa4cc1bc7e Mon Sep 17 00:00:00 2001 From: LIWEN CHENG Date: Fri, 30 Nov 2018 11:51:31 +0800 Subject: [PATCH 1/2] lecture2 --- ...Lecture-2-Language-Model-ustccheng02.ipynb | 5167 +++++++++++++++++ ...re-2-Wikipedia-Smoothing-ustccheng02.ipynb | 453 ++ 2 files changed, 5620 insertions(+) create mode 100644 2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb create mode 100644 2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb diff --git a/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb b/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb new file mode 100644 index 0000000..b873719 --- /dev/null +++ b/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb @@ -0,0 +1,5167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Probability Based " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import requests\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ASCII" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "all_content = open('80k_articles.txt',encoding='UTF-8').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "34475997" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_content)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'新华社照片,东莞(广东),2017年4月7日\\\\n(体育)(9)篮球——CBA总决赛第四场:广东对阵新疆\\\\n4月7日,广东东莞银行队球员易建联在比赛中扣篮。\\\\n当日,在2016-2017赛季中国男子篮球职业联赛(CBA)总决赛第四场比赛中,广东东莞银行队主场迎战新疆喀什古城队。\\\\n新华社记者孟永民摄\\\\n\\u3000\\u3000新华社北京4月14日新媒体专电(记者杨烨)作为国民经济的重要支柱,央企一季度交上了一份漂亮的“'" + ] + }, + "execution_count": 8, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "all_content[:200]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize(string): \n", + " #return ''.join(re.findall('[\\w|\\d]+', string)) # 不太明白两种写法的区别,w也能匹配数字,但输出结果是有不同\n", + " return ''.join(re.findall('\\w+', string))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'新华社照片东莞广东2017年4月7日n体育9篮球CBA总决赛第四场广东对阵新疆n4月7日广东东莞银行队球员易建联在比赛中扣篮n当日在20162017赛季中国男子篮球职业联赛CBA总决赛第四场比赛中广东东莞银行队主场迎战新疆喀什古城队n新华社记者孟永民摄n新华社北京4月14日新媒体专电记者杨烨作为国民经济的重要支柱央企一季度交上了一份漂亮的'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenize(all_content[:200])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "29733817" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ALL_CHARACTER = tokenize(all_content)\n", + "len(ALL_CHARACTER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unigram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$P(W_oW_1W_2Wn) = P(W_o) \\cdot P(W_1) \\cdot P(W_2) \\cdot P(W_n) $" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "all_character_counts = Counter(ALL_CHARACTER)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('的', 635684),\n", + " ('n', 605563),\n", + " ('国', 303683),\n", + " ('1', 285430),\n", + " ('在', 273451),\n", + " ('一', 255874),\n", + " ('中', 249541),\n", + " ('日', 248419),\n", + " ('2', 247140),\n", + " 
('新', 243975),\n", + " ('0', 240159),\n", + " ('年', 197627),\n", + " ('月', 183696),\n", + " ('人', 176780),\n", + " ('大', 162508),\n", + " ('社', 159861),\n", + " ('华', 156763),\n", + " ('是', 141034),\n", + " ('和', 131350),\n", + " ('赛', 130048),\n", + " ('发', 129080),\n", + " ('有', 128925),\n", + " ('为', 126278),\n", + " ('5', 124931),\n", + " ('了', 124569),\n", + " ('行', 122928),\n", + " ('7', 121544),\n", + " ('上', 118416),\n", + " ('外', 117495),\n", + " ('4', 112639),\n", + " ('业', 112051),\n", + " ('不', 112042),\n", + " ('会', 109898),\n", + " ('代', 100566),\n", + " ('地', 96026),\n", + " ('球', 92521),\n", + " ('时', 92322),\n", + " ('3', 92131),\n", + " ('者', 91667),\n", + " ('作', 91293),\n", + " ('以', 91191),\n", + " ('家', 89815),\n", + " ('成', 89485),\n", + " ('对', 87199),\n", + " ('6', 86963),\n", + " ('市', 85776),\n", + " ('来', 85467),\n", + " ('出', 85263),\n", + " ('生', 84895),\n", + " ('个', 83544),\n", + " ('这', 81872),\n", + " ('公', 81351),\n", + " ('比', 80433),\n", + " ('动', 80430),\n", + " ('线', 80196),\n", + " ('全', 79296),\n", + " ('体', 79271),\n", + " ('二', 79008),\n", + " ('进', 78995),\n", + " ('开', 78979),\n", + " ('1', 78756),\n", + " ('0', 75949),\n", + " ('多', 75399),\n", + " ('学', 73616),\n", + " ('队', 73414),\n", + " ('到', 73247),\n", + " ('斯', 72799),\n", + " ('法', 72555),\n", + " ('合', 72090),\n", + " ('展', 71993),\n", + " ('要', 71584),\n", + " ('场', 71047),\n", + " ('方', 70982),\n", + " ('部', 70644),\n", + " ('记', 69959),\n", + " ('工', 69432),\n", + " ('经', 67717),\n", + " ('分', 66855),\n", + " ('能', 65109),\n", + " ('区', 64966),\n", + " ('前', 64792),\n", + " ('于', 64751),\n", + " ('民', 64686),\n", + " ('高', 64161),\n", + " ('当', 64148),\n", + " ('美', 64016),\n", + " ('将', 63416),\n", + " ('2', 62896),\n", + " ('与', 62647),\n", + " ('员', 62067),\n", + " ('加', 60772),\n", + " ('后', 60761),\n", + " ('片', 60694),\n", + " ('现', 60591),\n", + " ('利', 60337),\n", + " ('主', 60068),\n", + " ('产', 58963),\n", + " ('等', 58799),\n", + " ('机', 
58508),\n", + " ('联', 58457),\n", + " ('特', 57946),\n", + " ('长', 57577),\n", + " ('化', 57341),\n", + " ('电', 57128),\n", + " ('用', 56850),\n", + " ('尔', 56778),\n", + " ('自', 56418),\n", + " ('建', 56356),\n", + " ('照', 55866),\n", + " ('下', 55840),\n", + " ('实', 55554),\n", + " ('过', 55069),\n", + " ('海', 54110),\n", + " ('路', 54097),\n", + " ('力', 53780),\n", + " ('北', 53433),\n", + " ('他', 53304),\n", + " ('第', 52636),\n", + " ('政', 52538),\n", + " ('西', 52510),\n", + " ('关', 52252),\n", + " ('文', 51815),\n", + " ('重', 51789),\n", + " ('同', 51457),\n", + " ('说', 50644),\n", + " ('里', 49866),\n", + " ('通', 49858),\n", + " ('内', 49753),\n", + " ('资', 49587),\n", + " ('8', 49314),\n", + " ('表', 49250),\n", + " ('9', 49168),\n", + " ('理', 49021),\n", + " ('我', 48966),\n", + " ('们', 48171),\n", + " ('本', 47561),\n", + " ('平', 47322),\n", + " ('天', 47322),\n", + " ('务', 46919),\n", + " ('金', 46747),\n", + " ('手', 46697),\n", + " ('亚', 46635),\n", + " ('目', 46335),\n", + " ('小', 46201),\n", + " ('面', 46037),\n", + " ('安', 45462),\n", + " ('子', 45203),\n", + " ('事', 44971),\n", + " ('名', 44969),\n", + " ('得', 44538),\n", + " ('网', 44437),\n", + " ('可', 44367),\n", + " ('际', 44319),\n", + " ('也', 43926),\n", + " ('设', 43772),\n", + " ('之', 43382),\n", + " ('选', 43013),\n", + " ('摄', 42876),\n", + " ('制', 42556),\n", + " ('科', 42495),\n", + " ('度', 42073),\n", + " ('品', 41941),\n", + " ('次', 41782),\n", + " ('马', 41496),\n", + " ('定', 41222),\n", + " ('入', 41218),\n", + " ('提', 41007),\n", + " ('德', 40949),\n", + " ('总', 40914),\n", + " ('最', 40389),\n", + " ('育', 40317),\n", + " ('车', 40197),\n", + " ('举', 39934),\n", + " ('就', 39834),\n", + " ('元', 39741),\n", + " ('示', 39711),\n", + " ('交', 39632),\n", + " ('期', 39215),\n", + " ('保', 39078),\n", + " ('心', 38963),\n", + " ('战', 38851),\n", + " ('京', 38809),\n", + " ('其', 38756),\n", + " ('级', 38659),\n", + " ('基', 37846),\n", + " ('拉', 37294),\n", + " ('南', 37176),\n", + " ('从', 36985),\n", + " ('报', 36816),\n", + " 
('点', 36691),\n", + " ('间', 36577),\n", + " ('5', 36511),\n", + " ('都', 36440),\n", + " ('东', 36180),\n", + " ('水', 35795),\n", + " ('数', 35509),\n", + " ('三', 35431),\n", + " ('带', 35418),\n", + " ('首', 35272),\n", + " ('道', 35174),\n", + " ('两', 35000),\n", + " ('明', 34903),\n", + " ('已', 34892),\n", + " ('强', 34866),\n", + " ('统', 34786),\n", + " ('据', 34750),\n", + " ('台', 34330),\n", + " ('及', 34105),\n", + " ('布', 33948),\n", + " ('克', 33941),\n", + " ('计', 33907),\n", + " ('好', 33641),\n", + " ('城', 33555),\n", + " ('相', 33482),\n", + " ('共', 33316),\n", + " ('万', 33204),\n", + " ('巴', 33195),\n", + " ('3', 32998),\n", + " ('近', 32909),\n", + " ('题', 32907),\n", + " ('完', 32733),\n", + " ('系', 32690),\n", + " ('技', 32676),\n", + " ('世', 32568),\n", + " ('军', 32536),\n", + " ('足', 32383),\n", + " ('物', 32362),\n", + " ('位', 32256),\n", + " ('省', 32180),\n", + " ('决', 32050),\n", + " ('还', 32004),\n", + " ('司', 31879),\n", + " ('项', 31854),\n", + " ('企', 31649),\n", + " ('持', 31644),\n", + " ('意', 31635),\n", + " ('院', 31594),\n", + " ('活', 31577),\n", + " ('式', 31554),\n", + " ('创', 31538),\n", + " ('山', 31101),\n", + " ('组', 31088),\n", + " ('而', 30789),\n", + " ('4', 30775),\n", + " ('并', 30737),\n", + " ('正', 30677),\n", + " ('罗', 30349),\n", + " ('胜', 30287),\n", + " ('量', 30214),\n", + " ('性', 30081),\n", + " ('运', 30080),\n", + " ('此', 29974),\n", + " ('欧', 29798),\n", + " ('起', 29662),\n", + " ('州', 29622),\n", + " ('管', 29514),\n", + " ('更', 29438),\n", + " ('信', 29428),\n", + " ('着', 29308),\n", + " ('达', 29094),\n", + " ('影', 28990),\n", + " ('增', 28861),\n", + " ('术', 28848),\n", + " ('所', 28706),\n", + " ('至', 28630),\n", + " ('标', 28371),\n", + " ('推', 28101),\n", + " ('局', 27813),\n", + " ('广', 27800),\n", + " ('规', 27790),\n", + " ('向', 27788),\n", + " ('门', 27550),\n", + " ('济', 27487),\n", + " ('参', 27433),\n", + " ('节', 27414),\n", + " ('立', 27286),\n", + " ('6', 27259),\n", + " ('任', 27244),\n", + " ('受', 26933),\n", + " ('收', 26795),\n", + 
" ('投', 26778),\n", + " ('造', 26668),\n", + " ('服', 26660),\n", + " ('应', 26553),\n", + " ('商', 26546),\n", + " ('被', 26528),\n", + " ('格', 26437),\n", + " ('今', 26434),\n", + " ('界', 26408),\n", + " ('游', 26203),\n", + " ('议', 26120),\n", + " ('程', 25956),\n", + " ('接', 25880),\n", + " ('改', 25851),\n", + " ('专', 25816),\n", + " ('研', 25749),\n", + " ('教', 25693),\n", + " ('种', 25654),\n", + " ('调', 25616),\n", + " ('各', 25494),\n", + " ('装', 25332),\n", + " ('但', 25253),\n", + " ('如', 25127),\n", + " ('问', 24988),\n", + " ('件', 24561),\n", + " ('领', 24550),\n", + " ('单', 24509),\n", + " ('村', 24411),\n", + " ('办', 24316),\n", + " ('约', 24283),\n", + " ('无', 24199),\n", + " ('情', 23988),\n", + " ('江', 23945),\n", + " ('英', 23881),\n", + " ('超', 23861),\n", + " ('回', 23848),\n", + " ('价', 23835),\n", + " ('因', 23746),\n", + " ('打', 23537),\n", + " ('导', 23509),\n", + " ('7', 23455),\n", + " ('纳', 23389),\n", + " ('解', 23324),\n", + " ('由', 22875),\n", + " ('指', 22836),\n", + " ('处', 22769),\n", + " ('看', 22754),\n", + " ('去', 22665),\n", + " ('兰', 22624),\n", + " ('港', 22593),\n", + " ('周', 22566),\n", + " ('传', 22363),\n", + " ('季', 22280),\n", + " ('治', 22256),\n", + " ('流', 21957),\n", + " ('士', 21900),\n", + " ('环', 21874),\n", + " ('图', 21869),\n", + " ('支', 21807),\n", + " ('女', 21799),\n", + " ('团', 21758),\n", + " ('阿', 21690),\n", + " ('集', 21657),\n", + " ('些', 21565),\n", + " ('查', 21505),\n", + " ('施', 21486),\n", + " ('空', 21486),\n", + " ('常', 21421),\n", + " ('客', 21191),\n", + " ('然', 21137),\n", + " ('结', 21055),\n", + " ('农', 21023),\n", + " ('果', 21022),\n", + " ('深', 21015),\n", + " ('委', 20972),\n", + " ('四', 20893),\n", + " ('放', 20877),\n", + " ('续', 20865),\n", + " ('府', 20747),\n", + " ('园', 20570),\n", + " ('尼', 20483),\n", + " ('步', 20406),\n", + " ('普', 20395),\n", + " ('口', 20387),\n", + " ('房', 20374),\n", + " ('张', 20252),\n", + " ('认', 20252),\n", + " ('获', 20231),\n", + " ('医', 20167),\n", + " ('原', 19897),\n", + " ('风', 19890),\n", 
+ " ('林', 19668),\n", + " ('9', 19539),\n", + " ('供', 19451),\n", + " ('米', 19444),\n", + " ('维', 19156),\n", + " ('河', 19093),\n", + " ('易', 19051),\n", + " ('让', 18991),\n", + " ('求', 18937),\n", + " ('老', 18832),\n", + " ('众', 18784),\n", + " ('党', 18783),\n", + " ('户', 18696),\n", + " ('股', 18692),\n", + " ('冠', 18687),\n", + " ('书', 18537),\n", + " ('少', 18531),\n", + " ('轮', 18521),\n", + " ('案', 18512),\n", + " ('身', 18484),\n", + " ('源', 18452),\n", + " ('8', 18415),\n", + " ('十', 18294),\n", + " ('境', 18276),\n", + " ('息', 18150),\n", + " ('航', 18139),\n", + " ('头', 18133),\n", + " ('演', 18127),\n", + " ('使', 18070),\n", + " ('很', 18023),\n", + " ('没', 17924),\n", + " ('观', 17900),\n", + " ('取', 17865),\n", + " ('花', 17858),\n", + " ('未', 17845),\n", + " ('协', 17760),\n", + " ('告', 17706),\n", + " ('证', 17683),\n", + " ('视', 17650),\n", + " ('龙', 17567),\n", + " ('非', 17559),\n", + " ('王', 17558),\n", + " ('引', 17541),\n", + " ('升', 17446),\n", + " ('亿', 17408),\n", + " ('样', 17403),\n", + " ('每', 17342),\n", + " ('融', 17341),\n", + " ('需', 17287),\n", + " ('费', 17284),\n", + " ('究', 17201),\n", + " ('营', 17178),\n", + " ('构', 17159),\n", + " ('准', 17056),\n", + " ('校', 16909),\n", + " ('别', 16887),\n", + " ('卡', 16854),\n", + " ('先', 16823),\n", + " ('师', 16757),\n", + " ('县', 16721),\n", + " ('李', 16670),\n", + " ('考', 16648),\n", + " ('助', 16592),\n", + " ('站', 16555),\n", + " ('域', 16528),\n", + " ('气', 16501),\n", + " ('色', 16442),\n", + " ('预', 16440),\n", + " ('变', 16392),\n", + " ('该', 16343),\n", + " ('权', 16305),\n", + " ('显', 16304),\n", + " ('型', 16299),\n", + " ('备', 16240),\n", + " ('护', 16218),\n", + " ('转', 16188),\n", + " ('列', 16133),\n", + " ('只', 16104),\n", + " ('走', 16046),\n", + " ('击', 15931),\n", + " ('模', 15924),\n", + " ('责', 15872),\n", + " ('整', 15811),\n", + " ('做', 15790),\n", + " ('给', 15697),\n", + " ('青', 15670),\n", + " ('快', 15616),\n", + " ('A', 15615),\n", + " ('双', 15611),\n", + " ('号', 15539),\n", + " ('俄', 
15474),\n", + " ('苏', 15413),\n", + " ('届', 15399),\n", + " ('直', 15396),\n", + " ('势', 15339),\n", + " ('包', 15301),\n", + " ('习', 15237),\n", + " ('划', 15188),\n", + " ('质', 15162),\n", + " ('称', 15095),\n", + " ('媒', 15060),\n", + " ('互', 15052),\n", + " ('乐', 15025),\n", + " ('极', 14918),\n", + " ('消', 14908),\n", + " ('率', 14899),\n", + " ('监', 14878),\n", + " ('香', 14868),\n", + " ('条', 14847),\n", + " ('态', 14767),\n", + " ('类', 14673),\n", + " ('越', 14658),\n", + " ('论', 14615),\n", + " ('晋', 14570),\n", + " ('光', 14554),\n", + " ('古', 14543),\n", + " ('博', 14521),\n", + " ('警', 14482),\n", + " ('伊', 14482),\n", + " ('优', 14459),\n", + " ('飞', 14416),\n", + " ('积', 14333),\n", + " ('清', 14329),\n", + " ('验', 14319),\n", + " ('铁', 14290),\n", + " ('织', 14262),\n", + " ('防', 14248),\n", + " ('难', 14150),\n", + " ('儿', 14147),\n", + " ('武', 14115),\n", + " ('效', 14088),\n", + " ('形', 14060),\n", + " ('闻', 14004),\n", + " ('落', 13881),\n", + " ('确', 13809),\n", + " ('速', 13805),\n", + " ('洲', 13745),\n", + " ('男', 13723),\n", + " ('银', 13703),\n", + " ('反', 13672),\n", + " ('夫', 13495),\n", + " ('五', 13469),\n", + " ('始', 13446),\n", + " ('想', 13425),\n", + " ('策', 13424),\n", + " ('旅', 13406),\n", + " ('奥', 13404),\n", + " ('贫', 13363),\n", + " ('土', 13332),\n", + " ('试', 13326),\n", + " ('精', 13323),\n", + " ('或', 13316),\n", + " ('具', 13281),\n", + " ('席', 13270),\n", + " ('知', 13176),\n", + " ('感', 13175),\n", + " ('卫', 13154),\n", + " ('见', 13129),\n", + " ('较', 13117),\n", + " ('采', 13063),\n", + " ('访', 13042),\n", + " ('庆', 12920),\n", + " ('己', 12916),\n", + " ('白', 12848),\n", + " ('热', 12717),\n", + " ('群', 12707),\n", + " ('段', 12686),\n", + " ('艺', 12668),\n", + " ('革', 12646),\n", + " ('连', 12646),\n", + " ('景', 12616),\n", + " ('再', 12615),\n", + " ('历', 12595),\n", + " ('限', 12555),\n", + " ('才', 12525),\n", + " ('班', 12522),\n", + " ('话', 12514),\n", + " ('把', 12489),\n", + " ('望', 12446),\n", + " ('希', 12412),\n", + " ('根', 12378),\n", + " 
('黄', 12365),\n", + " ('份', 12351),\n", + " ('范', 12335),\n", + " ('器', 12320),\n", + " ('边', 12264),\n", + " ('职', 12249),\n", + " ('她', 12240),\n", + " ('福', 12172),\n", + " ('星', 12124),\n", + " ('半', 12118),\n", + " ('注', 12104),\n", + " ('致', 12100),\n", + " ('干', 12035),\n", + " ('益', 11935),\n", + " ('复', 11891),\n", + " ('阳', 11860),\n", + " ('况', 11833),\n", + " ('朗', 11769),\n", + " ('友', 11755),\n", + " ('低', 11737),\n", + " ('湖', 11668),\n", + " ('馆', 11653),\n", + " ('纪', 11645),\n", + " ('负', 11636),\n", + " ('义', 11562),\n", + " ('火', 11537),\n", + " ('牌', 11521),\n", + " ('那', 11511),\n", + " ('练', 11494),\n", + " ('何', 11463),\n", + " ('功', 11405),\n", + " ('检', 11405),\n", + " ('波', 11401),\n", + " ('随', 11383),\n", + " ('值', 11350),\n", + " ('险', 11338),\n", + " ('批', 11328),\n", + " ('智', 11325),\n", + " ('降', 11297),\n", + " ('响', 11265),\n", + " ('介', 11239),\n", + " ('健', 11207),\n", + " ('韩', 11187),\n", + " ('排', 11132),\n", + " ('住', 11094),\n", + " ('容', 11084),\n", + " ('断', 11071),\n", + " ('塞', 10974),\n", + " ('养', 10963),\n", + " ('百', 10927),\n", + " ('仅', 10889),\n", + " ('存', 10861),\n", + " ('贸', 10859),\n", + " ('沙', 10794),\n", + " ('争', 10792),\n", + " ('则', 10792),\n", + " ('石', 10780),\n", + " ('春', 10766),\n", + " ('控', 10730),\n", + " ('字', 10709),\n", + " ('爱', 10694),\n", + " ('严', 10623),\n", + " ('承', 10622),\n", + " ('e', 10570),\n", + " ('镇', 10553),\n", + " ('评', 10538),\n", + " ('几', 10503),\n", + " ('核', 10454),\n", + " ('油', 10410),\n", + " ('陈', 10407),\n", + " ('曼', 10359),\n", + " ('史', 10349),\n", + " ('塔', 10333),\n", + " ('汉', 10315),\n", + " ('红', 10309),\n", + " ('购', 10299),\n", + " ('往', 10182),\n", + " ('识', 10152),\n", + " ('切', 10130),\n", + " ('余', 10077),\n", + " ('察', 10051),\n", + " ('货', 10042),\n", + " ('太', 10019),\n", + " ('宣', 10013),\n", + " ('么', 9988),\n", + " ('播', 9944),\n", + " ('坚', 9939),\n", + " ('C', 9927),\n", + " ('印', 9927),\n", + " ('宁', 9901),\n", + " ('财', 9895),\n", + " 
('央', 9877),\n", + " ('哈', 9872),\n", + " ('象', 9872),\n", + " ('奇', 9865),\n", + " ('终', 9850),\n", + " ('病', 9832),\n", + " ('透', 9812),\n", + " ('a', 9811),\n", + " ('刘', 9798),\n", + " ('官', 9797),\n", + " ('右', 9785),\n", + " ('左', 9784),\n", + " ('副', 9738),\n", + " ('款', 9719),\n", + " ('围', 9711),\n", + " ('富', 9666),\n", + " ('真', 9639),\n", + " ('居', 9637),\n", + " ('依', 9610),\n", + " ('食', 9609),\n", + " ('训', 9596),\n", + " ('志', 9559),\n", + " ('远', 9557),\n", + " ('继', 9541),\n", + " ('神', 9458),\n", + " ('奖', 9442),\n", + " ('均', 9383),\n", + " ('言', 9374),\n", + " ('涨', 9372),\n", + " ('哥', 9334),\n", + " ('执', 9330),\n", + " ('许', 9312),\n", + " ('突', 9299),\n", + " ('兴', 9291),\n", + " ('幕', 9287),\n", + " ('甲', 9269),\n", + " ('币', 9251),\n", + " ('底', 9243),\n", + " ('票', 9219),\n", + " ('破', 9211),\n", + " ('声', 9190),\n", + " ('销', 9168),\n", + " ('层', 9165),\n", + " ('锦', 9159),\n", + " ('药', 9139),\n", + " ('澳', 9128),\n", + " ('B', 9125),\n", + " ('孩', 9123),\n", + " ('黎', 9114),\n", + " ('族', 9069),\n", + " ('竞', 9046),\n", + " ('秀', 9025),\n", + " ('乡', 9004),\n", + " ('满', 9000),\n", + " ('岁', 8975),\n", + " ('离', 8965),\n", + " ('拍', 8942),\n", + " ('除', 8912),\n", + " ('减', 8904),\n", + " ('测', 8849),\n", + " ('善', 8838),\n", + " ('雷', 8810),\n", + " ('即', 8792),\n", + " ('萨', 8777),\n", + " ('索', 8739),\n", + " ('康', 8720),\n", + " ('峰', 8710),\n", + " ('I', 8692),\n", + " ('o', 8684),\n", + " ('绍', 8670),\n", + " ('盟', 8670),\n", + " ('洛', 8654),\n", + " ('篮', 8632),\n", + " ('困', 8618),\n", + " ('湾', 8612),\n", + " ('售', 8603),\n", + " ('鲁', 8589),\n", + " ('端', 8562),\n", + " ('律', 8553),\n", + " ('料', 8542),\n", + " ('候', 8525),\n", + " ('稿', 8516),\n", + " ('库', 8513),\n", + " ('瓦', 8494),\n", + " ('置', 8490),\n", + " ('勒', 8449),\n", + " ('托', 8441),\n", + " ('幅', 8432),\n", + " ('P', 8422),\n", + " ('培', 8368),\n", + " ('云', 8348),\n", + " ('审', 8341),\n", + " ('享', 8333),\n", + " ('什', 8326),\n", + " ('亲', 8309),\n", + " 
('埃', 8304),\n", + " ('录', 8288),\n", + " ('失', 8280),\n", + " ('陆', 8207),\n", + " ('迎', 8206),\n", + " ('故', 8203),\n", + " ('又', 8182),\n", + " ('诺', 8176),\n", + " ('蒙', 8172),\n", + " ('曾', 8145),\n", + " ('森', 8133),\n", + " ('命', 8122),\n", + " ('稳', 8113),\n", + " ('威', 8102),\n", + " ('迪', 8090),\n", + " ('便', 8089),\n", + " ('配', 8082),\n", + " ('括', 8073),\n", + " ('伦', 8072),\n", + " ('冰', 8067),\n", + " ('念', 8061),\n", + " ('编', 8035),\n", + " ('i', 8017),\n", + " ('略', 7998),\n", + " ('黑', 7957),\n", + " ('留', 7929),\n", + " ('微', 7928),\n", + " ('坦', 7911),\n", + " ('疗', 7908),\n", + " ('愿', 7904),\n", + " ('谈', 7866),\n", + " ('修', 7843),\n", + " ('请', 7829),\n", + " ('担', 7823),\n", + " ('初', 7820),\n", + " ('额', 7815),\n", + " ('午', 7796),\n", + " ('夺', 7793),\n", + " ('牙', 7711),\n", + " ('救', 7694),\n", + " ('促', 7685),\n", + " ('启', 7660),\n", + " ('础', 7602),\n", + " ('例', 7601),\n", + " ('密', 7565),\n", + " ('岛', 7524),\n", + " ('临', 7521),\n", + " ('算', 7479),\n", + " ('欢', 7476),\n", + " ('岸', 7470),\n", + " ('按', 7449),\n", + " ('伤', 7442),\n", + " ('早', 7439),\n", + " ('占', 7428),\n", + " ('船', 7421),\n", + " ('遇', 7398),\n", + " ('吉', 7397),\n", + " ('思', 7371),\n", + " ('音', 7370),\n", + " ('鲜', 7362),\n", + " ('汽', 7354),\n", + " ('津', 7346),\n", + " ('登', 7330),\n", + " ('祝', 7329),\n", + " ('剧', 7325),\n", + " ('你', 7323),\n", + " ('姆', 7299),\n", + " ('紧', 7288),\n", + " ('讯', 7283),\n", + " ('补', 7257),\n", + " ('板', 7255),\n", + " ('死', 7248),\n", + " ('室', 7243),\n", + " ('络', 7213),\n", + " ('某', 7209),\n", + " ('读', 7207),\n", + " ('雨', 7092),\n", + " ('探', 7089),\n", + " ('诉', 7076),\n", + " ('讲', 7072),\n", + " ('脱', 7061),\n", + " ('税', 7061),\n", + " ('停', 7043),\n", + " ('涉', 7026),\n", + " ('杯', 7000),\n", + " ('仍', 6985),\n", + " ('摩', 6983),\n", + " ('违', 6952),\n", + " ('像', 6928),\n", + " ('必', 6910),\n", + " ('N', 6890),\n", + " ('朝', 6873),\n", + " ('温', 6862),\n", + " ('签', 6859),\n", + " ('障', 6834),\n", + " 
('帮', 6833),\n", + " ('田', 6829),\n", + " ('松', 6827),\n", + " ('丝', 6813),\n", + " ('拿', 6801),\n", + " ('属', 6753),\n", + " ('移', 6745),\n", + " ('买', 6728),\n", + " ('洋', 6715),\n", + " ('千', 6708),\n", + " ('画', 6689),\n", + " ('宝', 6676),\n", + " ('角', 6648),\n", + " ('乌', 6625),\n", + " ('债', 6621),\n", + " ('轻', 6617),\n", + " ('八', 6597),\n", + " ('亮', 6586),\n", + " ('送', 6571),\n", + " ('店', 6563),\n", + " ('泰', 6550),\n", + " ('瑞', 6525),\n", + " ('舞', 6520),\n", + " ('跌', 6503),\n", + " ('绿', 6489),\n", + " ('判', 6488),\n", + " ('素', 6438),\n", + " ('申', 6422),\n", + " ('川', 6415),\n", + " ('待', 6404),\n", + " ('t', 6401),\n", + " ('晚', 6393),\n", + " ('夏', 6387),\n", + " ('攻', 6362),\n", + " ('坛', 6350),\n", + " ('r', 6345),\n", + " ('盘', 6344),\n", + " ('独', 6343),\n", + " ('吸', 6343),\n", + " ('贵', 6339),\n", + " ('逐', 6338),\n", + " ('童', 6322),\n", + " ('杨', 6318),\n", + " ('袭', 6305),\n", + " ('充', 6271),\n", + " ('载', 6228),\n", + " ('厂', 6193),\n", + " ('语', 6187),\n", + " ('假', 6146),\n", + " ('莫', 6138),\n", + " ('扶', 6136),\n", + " ('座', 6134),\n", + " ('挥', 6131),\n", + " ('压', 6105),\n", + " ('丽', 6083),\n", + " ('典', 6072),\n", + " ('综', 6062),\n", + " ('另', 6061),\n", + " ('丹', 6053),\n", + " ('眼', 6046),\n", + " ('六', 6046),\n", + " ('敦', 6041),\n", + " ('梅', 6038),\n", + " ('盛', 6033),\n", + " ('短', 6000),\n", + " ('木', 5991),\n", + " ('滑', 5989),\n", + " ('射', 5963),\n", + " ('刚', 5949),\n", + " ('毕', 5929),\n", + " ('莱', 5916),\n", + " ('母', 5902),\n", + " ('守', 5872),\n", + " ('庭', 5862),\n", + " ('付', 5861),\n", + " ('跑', 5854),\n", + " ('良', 5848),\n", + " ('且', 5846),\n", + " ('菲', 5834),\n", + " ('乒', 5831),\n", + " ('尽', 5796),\n", + " ('街', 5782),\n", + " ('歌', 5753),\n", + " ('纽', 5752),\n", + " ('督', 5732),\n", + " ('桥', 5729),\n", + " ('它', 5698),\n", + " ('措', 5692),\n", + " ('冲', 5692),\n", + " ('止', 5678),\n", + " ('署', 5673),\n", + " ('简', 5655),\n", + " ('令', 5650),\n", + " ('植', 5647),\n", + " ('够', 5631),\n", + " 
('券', 5607),\n", + " ('雪', 5598),\n", + " ('丰', 5594),\n", + " ('映', 5591),\n", + " ('驻', 5564),\n", + " ('顿', 5542),\n", + " ('S', 5536),\n", + " ('疆', 5534),\n", + " ('细', 5528),\n", + " ('沿', 5527),\n", + " ('绩', 5520),\n", + " ('弹', 5520),\n", + " ('派', 5516),\n", + " ('罪', 5514),\n", + " ('毛', 5510),\n", + " ('牛', 5507),\n", + " ('状', 5501),\n", + " ('免', 5492),\n", + " ('害', 5492),\n", + " ('彩', 5472),\n", + " ('输', 5448),\n", + " ('退', 5444),\n", + " ('斗', 5434),\n", + " ('亡', 5430),\n", + " ('郑', 5409),\n", + " ('钟', 5407),\n", + " ('贝', 5394),\n", + " ('频', 5377),\n", + " ('贷', 5370),\n", + " ('架', 5357),\n", + " ('版', 5330),\n", + " ('梦', 5319),\n", + " ('兵', 5318),\n", + " ('杜', 5298),\n", + " ('宫', 5295),\n", + " ('叙', 5280),\n", + " ('恐', 5279),\n", + " ('套', 5271),\n", + " ('汇', 5267),\n", + " ('蒂', 5259),\n", + " ('茶', 5252),\n", + " ('招', 5243),\n", + " ('悉', 5230),\n", + " ('圣', 5216),\n", + " ('换', 5196),\n", + " ('犯', 5158),\n", + " ('草', 5155),\n", + " ('s', 5131),\n", + " ('适', 5129),\n", + " ('激', 5120),\n", + " ('戛', 5108),\n", + " ('耳', 5095),\n", + " ('觉', 5053),\n", + " ('遗', 5047),\n", + " ('延', 5035),\n", + " ('毒', 5027),\n", + " ('疑', 5025),\n", + " ('皇', 5018),\n", + " ('析', 5014),\n", + " ('M', 5011),\n", + " ('惠', 5009),\n", + " ('聚', 4995),\n", + " ('爆', 4992),\n", + " ('追', 4981),\n", + " ('顺', 4980),\n", + " ('劳', 4974),\n", + " ('征', 4965),\n", + " ('否', 4963),\n", + " ('却', 4961),\n", + " ('课', 4955),\n", + " ('齐', 4942),\n", + " ('野', 4919),\n", + " ...]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_character_counts.most_common()[0:100]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAEMCAYAAADK231MAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xl8VNX5+PHPkz0kEPYtC/u+74obFhVQEbSi4IpS0fbrVqut7betttVv219bt9YNFREVcK9ocasUcUEh7PsqIWFLQsi+J+f3x7nRIU3CJDOZLc/79cormTt37n3mZOaZM+ece44YY1BKKRW6wvwdgFJKqealiV4ppUKcJnqllApxmuiVUirEaaJXSqkQp4leKaVCnCb6FkREuojIahEpEJG/NdM5DorIBc1xbBX4RMSISF8fnGeuiHzR3Oep59zfPUcReUZEfuOPOBojJBK9k1xKRKTQ5ae7v+MKQPOBbKCNMeZnnh5MRBaJyEOeh9W8fJV8VOAQkQUisltEqkVkbj37fCwiF3lyHmPMbcaYP3hyDF8IiUTvmG6MiXf5OVJ7BxGJ8EdgAaQHsMPoVXLKQ0HwXtoM/ATYUNedIhIHjAE+82VQfmOMCfof4CBwQR3bewIGmAccAlY7288AvgJysS+ISS6P6YX95xcAnwD/AF5x7psEZNR3buwH5/3AfuAE8DrQvlYsNzqxZAP/63KccOBXzmMLgPVAMvAk8Lda53wPuLuespgIrAPynN8Tne2LgAqgHCisp7wWOef7lxPDN0Cfes4zv9bx3nMpj3uBLU4MrwEx9RxjLvAl8KjzvzjgxD8XSAcygRtd9k8AFgNZQBrwayDMua+v83/Lc8r2NWf7aqfci5w4r64nlluAnc7z3gGMdrYPAlY58W0HLqtVXk8BHzjH/hLoCjwGnAR2AaNqvVbuc8qmCHgB6OI8vgD4N9DOZf/LnHPmOjEMqnUsd8s5DRjj/H2dUx6Dnds/Av7p/B3txH7E+XkMiHZ97QO/AI4BLzvb7wOOOvvf7By7bz1x3ORSxgeAW13uqzn+z5z/+1HgJpf7OwDLgXxgLfAH4As3csMXwNw6tl8GLHf+fhD7Xl3sxLYdGNvAMb97js5r4CE3n0M08Ffs+/848AwQ65Mc6YuTNPuTOH2iXwzEAbFAIjYJX4xNzBc6tzs5j1kDPOL8U851/vHuJvq7ga+BJOfxzwJLa8XynBPHCKAM583rvGG2AgMAce7vAIx33kQ1Ca0jUAx0qeP5tscmmOuBCGCOc7tD7RdlPeW4CMhxzhkBvAosO83+D9VRHmuB7k48O4Hb6nn8XKASmwDCgYecN8GTTvld5JR/vLP/YuBdoLVTnnuAec59S4H/df6nMcDZdb0x64ljFnAYGOeUfV/st59IYB/2AzgK+IETzwCX55+NrRnGACuBb4EbXJ7Pf2qVzdfY5J6ITQYbgFHO810JPODs2x/7YXChE8fPnViimlDOi4GfOX8vwFYmfuxy30+dv3/vxNcZ6IStDP3B5bVfCfzZiTUWmIpNWEOx768lDZU1cAnQxynj87Cv49G1jv975/le7Nzfzrl/GTYZxznnO4xnif4ZnA8abKIvdc4ZDvwR+LqBYzaU6Bt6Do9hP6zaY1/D7wF/9EmO9MVJmv1J2Bd9Ibbmk8v3NZSezj+lt8u+v8Cpjbhs+whb005x/lFxLvctwf1EvxOY7HJfN2ytN8IlliSX+9cCs52/dwMz6nl+O4ELnb9vB1bUs9/1wNpa29bUvNBxL9E/73L7YmDXafavK9Ff53L7/wHP1PP4ucBel9vDnDLq4rLtBDDSeQOW4dREnftuBVY5fy/GJrGkOs5zukT/EXBXHdvPwdZew1y2LQUedHn+z7ncdwews9bzya1VNte63H4LeLrW42teu78BXne5Lwyb3CY1oZzn8X3tdSe2Fr/MuZ3G98l2P3Cxy+OmAAddXvvluHxrABYCf3K53f9
0ZV0rrn/WlLtz/BIgwuX+TOy373Ds+2igy33/h2eJPg1Idv5+EPi3y32DgZIGjtlQoq/vOQj2g7uPy31nAt+6U1ae/oRSG/1MY0xb52dmrfvSXf7uAcwSkdyaH+BsbFLuDpw0xhS57J/WiBh6AO+4HHcnUIWtwdU45vJ3MRDv/J2MfaPV5SXsV26c3y/Xs1/3OuJNw9Ye3VVnfCLyK5eO7meacox6HHf5uwTAGFN7Wzz2m0wUpz4/1+f2c+ybaa2IbBeRm08To6v6yr47kG6Mqa7nnHXFX1fsrtzd/5T/pRNDeq1zu1vOnwHniEhXbNJ8DThLRHpim8M21XVO52/XQQ1ZxphSl9vdOfW91eB7RUSmicjXIpLjvD8uxv5fa5wwxlTW8Zw6YStLbp/rNHEMA/KNMa7Hq12WMU3sh2joObQC1rvkhw+d7c0ulBJ9Q4zL3+nYGn1bl584Y8yfsG1q7ZyOmhopLn8XYf9ZAIhIOKf+o9KBabWOHWOMOexGjOnYr7V1eQWYISIjsG3G/6xnvyPYDxtXKdiaoEeMMf9nvu/ovq1ms6fHbYRsbK3O9fl999yMMceMMbcYY7pja/pPNWKkTX1lfwRIFhHX94lXytMNp/wvRUSwH0iNPrcxZh824dyJ7acqwCa2+dhacc0HWe3XT4qz7btD1Tr0UScm1/3rJCLR2G8wf8V+Y2sLrMB+OJ9OFvabtlvncsPF2H4oX8rGfpAPcckNCcaYhipBXtNSEr2rV4DpIjJFRMJFJEZEJolIkjEmDUgFficiUSJyNjDd5bF7sJ/0l4hIJLYzMNrl/meAh0WkB4CIdBKRGW7G9TzwBxHpJ9ZwEekAYIzJwHasvgy8ZYwpqecYK4D+InKNiESIyNXYr6HvuxlDYx0HejfTsU9hjKnCttE+LCKtnTK+B/v/RERmiUiSs/tJbFKqcjPO54F7RWSMU/Z9neN/g/1w/7mIRIrIJOzrYZmXn15dXgcuEZHJzmvtZ9imq6+aeLzPsM1+NaNMVtW6DbZZ6tfO67Yj8Fuc8m0gxrkiMlhEWgEPNLBvFPa9kgVUisg0bB/MaTn/+7eBB0WklYgMxja11st5/8ZgP0ginfd5Tb67BPte8Rnnw/Q54FER6ezEmCgiU3xx/haX6J2vazOwHWxZ2NrcfXxfFtcAE7Cdkg9g235rHpuHHbL1PLZmVYTtZa/xOLaz5WMRKcB2bE1wM7RHsG+cj7EjC17AdnjVeAnb5ltfsw3GmBPApdikcALbnHGpMSbbzRga6wVgsPNVtL5vGd50B7bMD2DbXpdg24nBdqR+IyKF2P/BXcaYb537HgRecuK8qvZBjTFvAA87xyvAfmNqb4wpx47OmIatkT0F3GCM2dU8T++UmHZjm+n+7px7OnYIcXkTD/kZtgNwdT23wXYep2JH8mzFdhTXe52EMeYDbAfjSmxH8coG9i3AfqN4HftBfA32/+Su27FNIMew7eIvnmb/j7E16InYvpsS4FwRScB+K27qB6YnfoEtp69FJB87ymqAL04sTqeAqoeIPIjteLnudPs2cxznYmtXPWu1GSul3OR80F9pjPmvD/xQ1uJq9MHI+ep+F3ZEjCZ5pZouF3vdRosS6Fe3tXgiMgj7dXozdry5UqqJjDEf+zsGf9CmG6WUCnHadKOUUiHO6003zhCmPwBtgFRjzEvePodSSin3uZXoRWQhdthepjFmqMv2qdghheHYjsI/YYcuJmKHJ2bUcbj/0rFjR9OzZ8/GRa6UUi3c+vXrs40xp7261t0a/SLsLI7fjSl3rgp9EjvpUgawTkSWY8eFrjHGPCsibwKfnu7gPXv2JDU11c1QlFJKAYiIW1NBuNVGb4xZja2huxoP7DPGHHAu4liGrc1nYC+IgO+vTFRKKeUnnnTGJnLqJEMZzra3gSki8ndOveruFCIyX0RSRSQ1KyvLgzCUUko1xJPO2LomIzLGmGLstKgNMsYsEJGjwPSoqKgxHsShlFKqAZ7
U6DM4dTa5JE6d6e60jDHvGWPmJyQkeBCGUkqphniS6NcB/USkl4hEAbNp3CRFiMh0EVmQl5fnQRhKKaUa4laiF5Gl2JWKBohIhojMcybXvx27Os9O7Go42xtzcq3RK6VU83Orjd4YM6ee7SvwYF5nEZkOTO/b1931IZRSSjVWQMx1063vEHPzX5q2lkP7uCiuHJNEjw5xp99ZKaVCiIisN8aMPd1+fp29sqZGH9OtLx9tP3ba/euSW1zBP/6zj0n9O3HDxJ6c168TYWHurE6mlFItQ0DU6MeOHWuaemVsZn4pS9Ye4tVvDpFVUEbPDq247owezBqbTEJspJcjVUqpwOFujT7oE32N8spqPtp+jMVrDrLu4EliI8OZOSqRGyf2YGDXNt4JVCmlAkhQJHqXzthb9u7d67Xjbj+Sx8tr0vjnpsOUVlQzoVd7bpzYkwsHdyEyXGdmVkqFhqBI9DW8UaOvS25xOa+npvPy12mk55TQtU0M105IYfb4FDq1jvb6+ZRSypc00buoqjas2p3JS2vSWL0ni8hw4ZJh3bhxYk9GJrdFRDtvlVLBJygSfXM13TTkQFYhL3+dxpupGRSUVTI8KYEbzuzJpcO7ERMZ7pMYlFLKG4Ii0ddo7hp9XQrLKnln42EWf3WQvZmFtGsVyezxKVw7IYWkdq18GotSSjVFcCX6MaNM6pefNe3BETEQEdXkcxtjWHPgBIu/SuPjHXYs/wWDunDjxJ5M7NNBm3WUUgEruBJ993CTOj++aQ8Oi4Duo6Hn2fYneQJEN+1Yh3NLWPJNGkvXppNTVE7fzvHMHpdMcvtWtImJJCE2kjaxESTERhIXFaEXZiml/Cq4Ev3AFJO68OdNe3BRJqR9BYc3gKlyEv8o6HEW9DwHUiZAdOtGHbK0oooVW4/y0lcH2ZxR98yaYQKtayX/NjH2J6FVJG1inG0122MjSYiN+O629gcopTwVFIneq52xZYWQ/g0c/ALSvoTD66G6EiQcuo90SfxnQIz7F1AdyS0hp6ic/NIK8ksqyC+pJL+0grwSezuvpIL80kqXv+3v0orqBo8bFRFGQqz9oOiWEENSu1YktYslub3zu10rOsZHadORUqpeQZHoazRLZ2x5EaSv/T7xZ6RCdQVIGHQbcWrij23r3XMDZZVVFJRWNviBkF9SQW5xBUdyS8g4WcKJovJTjhETGUb3hFhax9pvCK1jImgdHWl/x0QSFRFGeBiEh4URESaEhQkRYUK4COFhQkS4ECZCq6hw4qMjiHceHx8TQXx0BFERevGYUsFME31t5cWQsc4m/YNf2L+rygGBbsOhh9PG3+NMiG3XvLHUo7i8koyTJWScLCY9x/4+kldKfkkFBaWVFJTW/K6kpMLzddejIsJo3yqKX148kBkjE73wDJRSvqSJ/nQqSmwtvybxp6+FqjJAoOtQW9vvcRb0mAit2vs2NjdUVlVTWW2orDZUVRmqjKGyuprqaqisrqaq2lDl3F9SXkVhmf2AKCyrpLC0wt4uq+SbAzlsSs/l3ov68z/n99WmIqWCSFBMU+xXkbHQ6xz7A1BRatv1076Eg59D6kL4+ilAIHk89J8KA6ZBp4EQAMkwIjyMCC/055ZVVnH/W1v568d7OJRTzMOXD9P5gJQKMS23Rn86lWV2JM+BVbDnQzi6yW5v28Mm/P5TbY3fgzH8gcIYw6Of7OGJlfvo0aEVM0YmMnNkd3p3auKQV6WUTwRF040/pkBosvwjsOcjm/QPrILKUohuA31+YBN/v4sCsomnMT7cZqd5XnPgBMZA745xDE9KYFhSWy4e1pVuCbH+DlEp5SIoEn2NgKzRN6S82Knpf2CTf+FxO5onecL3TTwd+wdEE09THMsr5b3NR1h7MIetGXkcyy8lMlz44egk5p3di14d44jQ5h2l/E4Tva9UV8PRjbD7Q5v4j22129v1gkHTYfBMSBwdtEkfIO1EEc9//i2vpaZTXllNmECXNjF0bxtLt4QYLhj
UhRkju2tHrlI+poneX/IybPPOrhXw7Wf2oq2EZBg8w/4kjoWw4KwNZ+aX8umuTI7klnAkt5QjuSUcyinmcG4JM0Z25+HLhxEf3XL795XyNU30gaDkJOz+AHa8C/tX2nH7bRJh0GU26SdPCNqkX6Oq2vD0qn088skeuraJ4e4L+nPF6ERt2lHKBzTRB5rSPNu8s+Nd2PdvO2Y/visMvgyG/hCSxgd10l+flsPv39vB5ow8YiLDaNcqioTYSNq2iiQqIpzoiDD6dY6nb+d4OreOoXObaDq3jiYhNlKbfJRqIr8lehGZBPwB2A4sM8asOt1jWkSid1VWYDtxd7wLez+ByhJISIFhP4Rhs6DLEH9H2CTGGD7ZcZxvvs0hz5neIa+knPIqQ3FZJQeyi6iqPvX1FhUeRqfW0Qzu3oYzencgqV0skwZ0ItobFwkoFeK8muhFZCFwKZBpjBnqsn0q8DgQDjxvjPmTiJwH3A8cBx4yxuw73fFbXKJ3VVYIu1fA1jdg36d2Bs7Og2HYlTD0SmjXw98Rek1pRRVH80rJzC8ls6DM+SklM7+Mtd/mcDi3BIDuCTEMTUwgJjKcbm1juHRYd4YlJfg5eqUCj7cT/blAIbC4JtGLSDiwB7gQyADWAXOAXcaYahHpAjxijLn2dMdv0YneVVE2bH8Htr4J6V/bbckTbC1/8EyI7+Tf+JqRMYaconK2HM7jxS8PkplfSmlFFYdzSxCEZbeewegU/8xBpFSg8nrTjYj0BN53SfRnAg8aY6Y4t38JYIz5o3M7ClhijLnydMfWRF+Hk2mw7S2b9DO32+mW+5xvk/7ASxo9x36wOlFYxsynvqSgtJJfTRvElWOSdMEXpRy+SPRXAlONMT9ybl8PTABWAlOAtsDT9bXRi8h8YD5ASkrKmLS0NLfiaJGOb7cJf+ubkHcIImJhwFQYPhv6XgDhoT2k8dvsIu57YzOpaSfp3TGOLm1iiIsOJzYqgriocHp2jKNPp3gm9ulAnA7vVC2ILyY1q6taZYwxbwNvn+7BxpgFInIUmB4VFTXGgzhCX5ch9mfyb+0sm1vfgO1v22aeuM4w/CoYeU3QduKeTq+Ocbxx25m8veEw7205QlFZJUfzKigut3P+ZxeWARAbGU5y+1gGdm1DQmwkXRNiOK9/J4Z0b6Mje1SL1mxNN42hTTdNUFVhR+xsetWO4KmugK7DYeS1tnknroO/I/SZ7MIy9hwr4OMdx/k2u4i0E0XklVRwsrgCgM6toxnYzSb/Xh3jiIkMIzoinNEpbRml7f4qiPmi6SYC2xk7GTiM7Yy9xhizvRFBBs+kZoGs6ARse9Mm/aOb7bq5/afaWn7fC0Nihs2myC4sY9XuLFbtziQ9p5isgjKO5JV+d78ITBnclalDu9KlTQzJ7WPpnhCrfQAqaHh71M1SYBLQETts8gFjzAsicjHwGHZ45UJjzMNNCVZr9F50fDtsWgJbXrcLp7fqAMOugpFzbI2/hTdhVFcbyquqOVlczrOfHeDN9RkUllV+d39MZBjd28YyoEtrbpzYkzN6t5xvRir4BMWVsVqjb0ZVlbD/U5v0d6+w0y90HmJr+cOvgvjO/o4wIJSUV3E4t5hjeWUcyinmQFYhR/JKWPvtSbILy7hgUGeGJibQIT6alPatOKN3e72YSwWMoEj0NbRG38yKc2zn7aYldhWtsAg7lfLoG+18+mGauGorLKvk75/u5bXUdHKdtn6A+OgIzh/YmQFd4hmd0o4zenfQph7lN0GR6LVG7wdZu2Hjy7BpKRRn20nWRl1nO3FD6Cpcb6pwmnp2HMnnw23HWLkrk8wCO9KnVVQ4g7u1YUj3NvTv2ppxPdvTv0vLuMZB+V9QJPoaWqP3g8pyO3/+hsV26gWA3pNg9A32gqyIaH9GF/AKSiv4cNsxth/JZ+vhPHYdzaeovAqAkcltObd/J4YnJjChd3tax0T6OVoVqjTRK/flptsROxtfgbx0iG0PI+bA6Ouh8yB
/RxcUjDEcPFHMOxsPs3pPFpszcjEGOsRFcenwbvTqGMdFQ7rSva0ux6i8JygSvTbdBJjqKjjwH1vL37XCjs1PGm9r+UMuh2hdLNxdBaUVbE7P45FPdrP7WAFF5VXERIYxrmd7RiW3ZeaoRHp1jNMLuZRHgiLR19AafQAqzIIty2zSz94DUfF23vzRNwb90oi+Zoxh17ECXvk6jY2Hctl1LJ9qYy/kumpsMjNHdadvZ23XV42niV55hzF22oUNi+3InYpiO43y6Btg+NXQqr2/Iww6R3JL+PfO46zancXKXZkAJLeP5aLBXTm7X0fO6tORqIjgXYRG+U5QJHptugkypfl2Rs0Ni+HIBgiPtk06Y2+y0ylrLb/RMvNLWbH1KCt3Z/H53iyMgeiIMGaNTeK+KQNJiNWOXFW/oEj0NbRGH4SObYXUF+0VuOUFtpY/5iYYcTXE6CIhTVFQWsE3B3JYtu4Q/96ZSUSYMG1YN87o3Z7JA7vQNSHG3yGqAKOJXvlGWaGdZyf1RTi6CSJbwdArYOzN0F3b8pvCGMOGQyd5e8Nhlm86QkFZJVERYZzZuwMT+3RgdI92jExuS6QuwN7iaaJXvnd4A6x/0c6bX1Fs59YZe5OdTbOFLJTibcYYdh4t4PXUdL7Yl82+zELATsncp3Mc88/tw5QhXXRahhZKE73yn9I826SzfhEc32ZH7AybZZN+txH+ji6oZRWUsT4th9V7s1m5M5Nj+aXER0cwfUR3/uf8PiS1a+XvEJUPBUWi187YEGcMZKRC6kI7YqeyFBLH2Lb8oVdAVJy/Iwxq1dWGz/Zm8f7moyzffJjKasPkgV2YNTaJ8/p3IiZSa/mhLigSfQ2t0bcAJSdh82s26WfvhugE23E75iboMtjf0QW9g9lFLPj8AMs3HaGwrJK4qHCuHJPEvVMG6BQMIUwTvQpMxsChNTbh73jXTp+cfIZt1hk8AyJ1igBPlFZUsT7tJP/ceJi3NmTQPi6K2eNSuOXc3jpUMwRpoleBr+gEbF5iR+zk7IfYdnYWzbE3Q4c+/o4u6G08dJLHP93Lqt1ZdIyP4pZzenPlmCQ6xOuEdaFCE70KHsbAt6sh9QXY+T6YKugzGcb9CPpP0fnyPfTNgRP8+cNdbDiUS2S4MGVIV+ZO7MnYnnpVc7DTRK+CU/5R2PCSHbFTcBQSkmHMXDvlgq6K1WQ1wzTfXJ/Bm+vTyS+t5MHpg5k9PkU7bYNYUCR6HXWj6lVVAbs/gHXPw7efQVikbcMfNw9SztQLsTxQUFrB/yzZyOo9WbSOieCa8SnMGptM3846O2mwCYpEX0Nr9KpB2Xtt5+3GV6Esz063MG6enVRNL8Rqkupqw5oDJ3h5TRqf7DxOVbVhQq/23DW5H2f26aDTJwcJTfQq9JQX2UnV1j4Hx7bYC7FGzIax83SIpgeO55eyfNMRnlq1j5PFFQzs2pqfTx3A+QM6a8IPcJroVegyxi5yvu552PY2VJVBj7NsLX/gdIiI8neEQam0oor3txzliU/3ciinmJHJbbnrgn6c16+TLoAeoDTRq5ah6ARsesU27Zw8CHGdbcftmLnQNtnf0QWliqpqXk9N5++f7uNYfikjkhL466wR9NNFzwOOJnrVslRXw/6Vtpa/50PbWdt/mq3l9z4fwnSmx8Yqq6xi+aYj/O69HRSWVTJtaFd+felgEnXd24Dh10QvInHAauABY8z7p9tfE73yqpNpdnjmhsVQnA3te9t2/JHX6IpYTXA0r4RFXx7k2dUHiAgT5k7syS+mDdRpkgOAVxO9iCwELgUyjTFDXbZPBR4HwoHnjTF/crb/HigCtmuiV35TWQY737O1/ENrICLGrns7bp6dXE01StqJIp5etZ9l69Lp1TGOey7sz/QR3f0dVovm7UR/LlAILK5J9CISDuwBLgQygHXAHKA70BGIAbI10auAcGybvfJ282tQUQTdR8G4W+wsmjq/TqP8e8dxHv33HrYfyecHAzvzpx8
Oo3NrXf3KH7zedCMiPYH3XRL9mcCDxpgpzu1fOrvGA3HAYKAEuNwYU93QsTXRK58pzYctr9laftYuO7/O6Bts0067Hv6OLmhUVFXz4pff8pePdtMmJpJZY5O59dzetIvTEU++5ItEfyUw1RjzI+f29cAEY8ztzu25NFCjF5H5wHyAlJSUMWlpaW7FoZRXGAMHP7dj8nf9C0w19J8K42/RzttG2HjoJI98socv9mXTKT6aW8/rw3VnpOiKVz7ii0Q/C5hSK9GPN8bc0YggdQoE5X95h+0SiOsXQVEWtO9jJ1QbeQ3EtvV3dEFhfdpJ/vzBLtYezKF3xzh+efEgLhikF1w1N7813Rhj/tjYYLXpRgWEyjLYsRzWLoCMtXah8+FX21p+lyH+ji4orNqdye/f28GB7CLO7N2B388YouPvm5EvEn0EtjN2MnAY2xl7jTFmeyOC1Bq9CkxHNsG65+xC55WlzpW3P4JB0yFcF/BoSE37/dOr9lNRZfjDzCFMH96dCB2O6XXeHnWzFJiEHU1zHDs+/gURuRh4DDu8cqEx5uGmBKs1ehWwinNg4yu28zY3DeK72tWwxsyF1l39HV1AO5Jbwo9eSmXH0XzO69+JJ+aM0lWuvCworozVGr0KGtVVsO/ftvN23ycQFgGDLoPx8yHlDJ02uR5V1YZnPtvPo5/sIbFdLE9fO4bB3dv4O6yQERSJvobW6FVQObHfmTb5ZSjNgy5DbTv+sFkQFefv6ALS+rQcfvLqBnKLK3j48mFcOSbJ3yGFhKBI9FqjV0GtvAi2vgFrn4fjWyE6AUZdZ6+81TVv/0t2YRl3LNnImgMnuGZCCr+9dLCubuWhoEj0NbRGr4KaMZD+jR2ts+NdqK6EvhfYK2/7Xahr3rqorKrmrx/v4ZnP9tOzQyv+dtVIxvRo5++wgpYmeqX8oeAYrH/JNu0UHoO2PWwNf9T1OqGaiy/2ZvOLt7aQWVDKwzOHMWtsko65b4KgSPTadKNCVlUF7Hrfdt6mfelMqHalbcvvPtLf0QWEvOIK5r+cyjff5jB7XDK/mzFEr6htpKBI9DW0Rq9C2vHtNuFveQ0qiiFpnB2tM3gGRET7Ozq/qqyq5nfv7eDlr9Phf4cjAAAXQ0lEQVQY26MdL940jtYxOgTTXZrolQo0JbmwealN+jn7Ia4TjL7RjstPaNmjUJZvPsLdyzZyVt+OPHXtaE32bgqKRK9NN6pFqq6GA/+xF2Ht/gAkDAZebGv5Pc9psWPyX09N5/63tnBW3468OHecXknrhqBI9DW0Rq9arJNptuN2w2IoyYFOA+1UCyNmQ3TLmyNm6dpD/PLtrdx8Vi9+O32wv8MJeO4mev3IVMqf2vWAC38H9+yAmU/bRVBW3At/GwT/uheydvs7Qp+aMz6FuRN7svDLb3n2s/3+DidkRPg7AKUUNsGPvMb+ZKy3Y/I3vGQnVut1Hky41c6X3wLG5P/20sFkF5bxxw920bdzPJMHdfF3SEFPm26UClSFWTbZpy6E/MOQkALjbrYduCE+Jr+0ooppj39OdkEZC24Yy5l9Ovg7pIAUFG302hmrlBuqKmH3ClvLP/j592PyJ8yHbiP8HV2zOXSimHkvreNIbglPXjuaSQM6+zukgBMUib6G1uiVctPxHbY5Z/MyOyY/eYIdrTPoMogIvfVaj+aVcNOL69ibWcgjV41gxshEf4cUUDTRKxXKSnJh0xKb9HMOQHwXGOPMk9+mm7+j86rCskpuXLiWTem5/G3WCGaO0mRfQxO9Ui1BdTXs/9Q26+z92M6TP3iGreUnTwiZMfnF5TbZr087yWOzR3HZiO7+DikgaKJXqqU5sR/WvWBXxCrLg67DbcIfdqUd1RPkCssque75b9h+JI/lt5/NoG66gIkmeqVaqvIiO6/O2ucgcwfEtoPRN8DYeXbcfhA7UVjGlMc+Bwzv33EOXRNi/B2SXwXFBVMiMl1EFuTl5fkzDKVCS1QcjL0ZfvwV3Pi+nVb
hq3/A4yNg6RzY/x87h34Q6hAfzZJbJlBUVsVPX9tERVW1v0MKClqjV6olyMuw4/HXL4LiE9Cxv23WCdKpFl5bd4hfvLWVy0cl8rdZIwgLC42+iMYKihq9UspHEpJg8m/hpzvg8mchKv77qRZW3AfZwXUdy9XjUrj7gn68s/Ewf/24ZU0T0RQ6BYJSLUlkjK3Fj5jtTLXwrK3lr10Avc+3tfz+U4JiqoW7JvfjeH4pT63aT/e2sVx3RnD3PzQnbbpRqqUrzHTm1VkIBUegbYqdQTMIlj+srKpm/svr+c/uTJ69bgwXDenq75B8SkfdKKUap6oCdv3L1u5rlj8cNsvW8rsN93d09Sosq+Ta575mb2Yhz14/hnP6dfJ3SD7jt0QvIoOAu4COwKfGmKdP9xhN9EoFmGPbnKkWXoPKEkg50653O+gyCA+81Z8y80u5/oW1HDxRxCc/PY+UDq38HZJPeLUzVkQWikimiGyrtX2qiOwWkX0icj+AMWanMeY24CrgtAEopQJQ16Ew/XH42U646GEoOApv3gyPDoVVf4aC4/6O8BSd28Sw4IYxVBvDvW9splKHXZ7C3VE3i4CprhtEJBx4EpgGDAbmiMhg577LgC+AT70WqVLK92LbwcTb4Y6NcM3r9gNg1f/Bo0PgrR9B+tqAGZPfo0Mcf7piOGsP5vDQv3b6O5yA4laiN8asBnJqbR4P7DPGHDDGlAPLgBnO/suNMROBa70ZrFLKT8LC7Gic696C29fbzto9H8ELF8KC82Djq1BR4u8ouWJ0InPGp7Doq4O8vOagv8MJGJ6Mo08E0l1uZwCJIjJJRJ4QkWeBFfU9WETmi0iqiKRmZWV5EIZSyqc69oVpf4J7dsIlf4PKMnj3J/DIYPjkAcg95LfQRISHZg7l7L4d+d17O1h3sHb9tGXyJNHXdSmaMcasMsbcaYy51RjzZH0PNsYsAH4HbIiKCr15tJUKedHxtmb/k6/hxvegx0T46gk71cKya+HAZ35p1gkPEx6bPZLk9q24fckGcovLfR5DoPEk0WcAyS63k4AjjTmAMeY9Y8z8hIQED8JQSvmVCPQ6F2a/CndtgbPugrSvYPFl8OQEO7laWaFPQ+oYH80Ts0eRVVDGI5/s8em5A5EniX4d0E9EeolIFDAbWN6YA+ikZkqFmLbJcMGDtlln5tN2euQV98Ijg+CDX0D2Pp+FMiwpgevP6MErX6exKT3XZ+cNRG6NoxeRpcAk7Nj448ADxpgXRORi4DEgHFhojHm4KUHoOHqlQpQxkJFqp1rY/k+oroA+k+1FWP0ubPapFvJKKpjy6GoiwoXlt59N+7jQaiYOiitjdXFwpVqQguN2qoXUhXZcfruezlQL19lhnM1kU3ouVz27hjEp7Vg8bzyR4aEzl2NQJPoaWqNXqgWpqoCd79m2+0NfQUQsDL/K1vK7Dm2WU761PoOfvbGZOyf3454L+zfLOfwhKKYp1jZ6pVqg8EgYegXc/AHc+jkMnwVbXodnzoKF02D7O/bDwIt+OCaJK0Yn8veVe/lkR2Bd1esLWqNXSvlfcY5d63bdc3YcfuvudpWsMTdCfGevnKKkvIrLn/qS7MJyPrjrHDq1jvbKcf0pKGr0SikF2OmQz7oT7twEc5ZB54Hwn4fsRVhv3WI7dD2slMZGhfPo1SPJKynnJ6+up6ra/5VcX9GmG6VU4AgLhwHT4Pp34PZUW6vf/QE8PxmeOx82LYWK0iYfflC3Njw0cyjrDp7k/324y4uBBzZtulFKBbayAti8zM6Tn70HWnWA0TfCuHl2icRGMsbwq3e2sXTtId647UzG9QzsxVUaoqNulFKhxRj49jP4ZgHs+cBuG3iJHa3T8xx7ha6bissrOe8vq0hsG8s7P5mINOKxgSQo2ui16UYp5TYR6D0J5iyxbfkT74SDX8BL0+GpM2HdC25PtdAqKoJ7LuzPpvRcVu0O/UkVtUavlApeFSWw7S345lk4tgWiE2DUtfZCrA5
9GnxoeWU1kx9ZRWR4GB/dfW5QXkgVFDV6pZTySGSsvbL21tVw88d2WoW1C+Dvo+GVK2HPx1Bd92pTURFhPHDpEA5kFbHoy4O+jdvHNNErpYKfCKRMgCtfgJ9uh0m/tDX8JbNs0l/zJJT898Rmkwd15rz+nXj033vILizzQ+C+oYleKRVaWneFSffD3dvghy9AfBf46Fd2Bs337obj27/bVUT49SWDKKmo4rnVB/wYdPPSzlilVGiKiIJhV8K8j2D+Z3bahc1L4emJsOhS2PEuVFXSr0trLhvRnRe/OsjhXP8vh9gctDNWKdVyFOfAhsV2hE7eIWiTCGNv4mjfqznnH9uYNTaJP14x3N9Ruk07Y5VSqrZW7eHsu+GuTTB7KXTsBysfotsLY3i9y0tsX7eK9Jxif0fpdVqjV0q1bFm7Ye1zVG9aQlhFERlxQ0i66C4YMhMiAnviM63RK6WUOzoNgEv+StjPdvFax9upKMyBd+bDo0Ng5UOQ36ilsAOSJnqllAKIacPYq3/JD8r+wluDHofEMbD6r/DYMHhjrl3wPABaQJpCR90opZSjT6d4fjCwKw/s6Ebe5a/AnRtgwm2wfyW8OA2eOQfWvwTlwdWO79dEb4x5zxgzPyEhwZ9hKKXUd35yfl8KyypZ8s0haN8bpjwM9+yE6Y+DqYb37rRj8j/+NZw86O9w3aJNN0op5WJMj3ac2bsDL375LRVVzvQJUXEwZi78+EuYuwJ6nwdrnoLHR8KS2bbGH8DNOprolVKqlvnn9SazoIzFa9JOvUMEep4FVy2Gu7fCOT+DjHXw8uXwj3F2CuWyAv8E3QBN9EopVcuk/p04o3d7/r5yL3kl9SxUnpAIk38D9+yAy5+F6NbwwX3wt0Gw4j7I3uvboBugiV4ppWoREX4xdSB5JRXMfXFtwztHRMOI2TD/P/CjlTDwYli/CP4xFhbPtEshVlf5JO76aKJXSqk6jEppx6T+ndh4KJeNh06696CkMXDFAjuD5vm/thdjLZ0NT4yCL5+wUzD4QbMkehGZKSLPici7InJRc5xDKaWa20OXDwPgjysauZB4fGc47z64ewvMWmTXtv3kN/DIYFh+Bxzb6v1gG+B2oheRhSKSKSLbam2fKiK7RWSfiNwPYIz5pzHmFmAucLVXI1ZKKR9JbBvL0MQ2rD2YQ8bJJoydD4+EIZfDTSvgti9g+FWw5Q145mxYOA22vQ1V9fQBeFFjavSLgKmuG0QkHHgSmAYMBuaIyGCXXX7t3K+UUkHpd5cNBeDuZZs8O1DXYXDZE7bz9qKHIP8wvHkTpC70QpQNczvRG2NWA7UbmMYD+4wxB4wx5cAyYIZYfwY+MMZsqOt4IjJfRFJFJDUrK/QX51VKBacxPdrRo0MrUtNOciyv1PMDtmoPE++AOzfCnNdg2CzPj3kanrbRJwLpLrcznG13ABcAV4rIbXU90BizwBgz1hgztlOnTh6GoZRSzefB6UMAeOhfO7x30LBwGDDVJv5m5mmilzq2GWPME8aYMcaY24wxz9T7YJ3rRikVBM4f2Jm+neN5f8vRoJyv3tNEnwEku9xOAoJ/Tk+llKrlzz+0K0/96h3fjpjxBk8T/Tqgn4j0EpEoYDaw3N0H66RmSqlgMaZHO9rERPD53mxOFpX7O5xGaczwyqXAGmCAiGSIyDxjTCVwO/ARsBN43RizvaHj1DqmNt0opYJGzXqyP39ri58jaRxdSlAppRqh3/+uoKLKsO13U4iPjvBrLLqUoFJKNYOHZtpx9U+v2ufnSNynK0wppVQjzBpjx5889/m3fo7EfbrClFJKNUJYmHDBoM6UV1az7XBwVFK1Rq+UUo30i6kDAfjHyuBovtEavVJKNVK/Lq0B+HD7MT9H4h7tjFVKqSaYO7EnAB9sPerfQNygiV4ppZrgzsn9APjrx7v9HMnpaRu9Uko1Qfu4KPp3iWd/VlHT5qr3IW2jV0qpJvrNpXb5jUc+2ePnSBqmTTdKKdVE5/S
zU6y/veEwgTDLQH000SullAeuGJUIwAZ3FxD3A22jV0opD9x6Xh8A/ufVjX6OpH7aRq+UUh4Y0LU1Se1iOZZfyoGsQn+HUydtulFKKQ89ctVIAP7yUWAOtdREr5RSHhrfqz0i8MG2Y1RXB16nrCZ6pZTygmsnpACwfHPgraaqiV4ppbzg3osGAPDiVwf9G0gddNSNUkp5QdtWUXRLiGFzeq6/Q/kvOupGKaW8ZPKgzgBsyQisZK9NN0op5SVXOqtPvbYu3c+RnEoTvVJKecmwRNs68f6WwJq6WBO9Ukp5SXiYcF7/TuSVVFBaUeXvcL6jiV4ppbzo3P52orNVu7P8HMn3NNErpZQXTRnSBYA312f4OZLveT3Ri0hvEXlBRN709rGVUirQJbVrBcDqvUFWoxeRhSKSKSLbam2fKiK7RWSfiNwPYIw5YIyZ1xzBKqVUMJg7sSflldVsDJCpi92t0S8CprpuEJFw4ElgGjAYmCMig70anVJKBaEZI7sD8O6mwJgOwa1Eb4xZDeTU2jwe2OfU4MuBZcAMd08sIvNFJFVEUrOyAucrjlJKeWqoM8xyf4BMW+xJG30i4HpVQAaQKCIdROQZYJSI/LK+BxtjFhhjxhpjxnbq1MmDMJRSKrBEhocxNLENn+/NDohhlp4keqljmzHGnDDG3GaM6WOM+WODB9C5bpRSIeqsvh0B2Hk038+ReJboM4Bkl9tJQGA0SCmllJ9N6m/nvVnyzSE/R+JZol8H9BORXiISBcwGljfmADqpmVIqVJ3ZpwOdWkfz2R7/90G6O7xyKbAGGCAiGSIyzxhTCdwOfATsBF43xmxvzMm16UYpFcpiI8PJLCgju7DMr3G4O+pmjjGmmzEm0hiTZIx5wdm+whjT32mPf7ixJ9cavVIqlN01uR8AJwrL/RqHToGglFLNpEN8FAAPLN92mj2bl64wpZRSzeSsvh2JDBfKK6v9GoeuMKWUUs0kMjyMSQM6s+FQLvmlFX6LQ2v0SinVjDq3jgbgw63H/BaD1uiVUqoZ3TdlAACFZZV+i0E7Y5VSqhnFRIYDcDi3xG8xaKJXSqlmFB1h0+wLX3zLoRPFfolB2+iVUqoZiQh3OuPpswpL/RKDttErpVQzO7N3BwDKK41fzq9NN0op1cyiIuxkv+VV/hlPr4leKaWaWXSE7ZC9743NbPDD8oLaRq+UUs2sf5fWXH9GDzILyth+2Pf5TtvolVKqmUVFhHHPhf0BqKjyfTu9Nt0opZQPRITbdvqqak30SikVkiLDbbqtqPZ9h6wmeqWU8oGIMFujr9SmG6WUCk3hYYIILFh9gB+/st6n59ZRN0op5QMiwv9ePIjubWNY7eN1ZHXUjVJK+ciPzunNpAGdqTK+bb7RphullPIhEfD1wBtN9Eop5UPhIlT7ONNroldKKR8KDxNtulFKqVAWJoIxYHyY7DXRK6WUD4WH+f4K2QhvH1BE4oCngHJglTHmVW+fQymlgtV3id4Y7yfgerhVoxeRhSKSKSLbam2fKiK7RWSfiNzvbL4CeNMYcwtwmZfjVUqpoCY2z+PLZnp3m24WAVNdN4hIOPAkMA0YDMwRkcFAEpDu7FblnTCVUio0hDuZ/r43t3DPa5v45sCJZj+nW4neGLMayKm1eTywzxhzwBhTDiwDZgAZ2GTf4PFFZL6IpIpIalaWb68SU0opfxmZ3JbeHePYlH6SdWk5ZBeWN/s5PWkiSuT7mjvYBD8BeAL4h4hcArxX34ONMQuABQBjx471z0KKSinlYxN6d2DlvZN8ek5PEr3Usc0YY4qAm9w6gMh0YHrfvn09CEMppVRDPBlemQEku9xOAo54Fo5SSilv8yTRrwP6iUgvEYkCZgPLG3MAndRMKaWan7vDK5cCa4ABIpIhIvOMMZXA7cBHwE7gdWPM9sacXKcpVkqp5ie+vAy3PmPHjjWpqan+DkMppYKKiKw3xow93X46BYJSSoU4XWFKKaVCnK4wpZRSIc5Xc+r
UqWYcPZAvInudzQmAaxW/9u3a2zoC2c0YZl3n9/ZjG9qvsfe5s6327UAtQ2+UX0P3a/m5t19jyq+u7foebr73cA83YrNzIgfSD7Cgodu1twGpvoynOR7b0H6Nvc+dbXXcDsgy9Eb5NXS/lp/3y8+d8qq9LVDLz1tl6Iv38Ol+ArEztva0CXVNo1Dv1ArNwJNzufvYhvZr7H3ubPNl+XlyPm+UX0P3a/m5t19jyq+u7foebtx9Xn8NBsTwSk+ISKpxY3iRqp+WoWe0/Dyj5df8ArFG31gL/B1ACNAy9IyWn2e0/JpZ0NfolVJKNSwUavRKKaUaoIleKaVCnCZ6pZQKcSGX6EUkTkReEpHnRORaf8cTbESkt4i8ICJv+juWYCUiM53X37sicpG/4wk2IjJIRJ4RkTdF5Mf+jicUBEWiF5GFIpIpIttqbZ8qIrtFZJ+I3O9svgJ40xhzC3CZz4MNQI0pP2PXAJ7nn0gDVyPL8J/O628ucLUfwg04jSy/ncaY24CrAB126QVBkeiBRcBU1w0iEg48CUwDBgNzRGQwdqWrmrVsq3wYYyBbhPvlp+q2iMaX4a+d+1Ujy09ELgO+AD71bZihKSgSvTFmNZBTa/N4YJ9TAy0HlgEzsEscJjn7BMXza26NLD9Vh8aUoVh/Bj4wxmzwdayBqLGvQWPMcmPMRECbX70gmBNhIt/X3MEm+ETgbeCHIvI0vr9UPZjUWX4i0kFEngFGicgv/RNa0KjvNXgHcAFwpYjc5o/AgkR9r8FJIvKEiDwLrPBPaKHFr7NXekjq2GaMMUXATb4OJgjVV34nAE1O7qmvDJ8AnvB1MEGovvJbBazybSihLZhr9BlAssvtJOCIn2IJRlp+ntMy9IyWn48Ec6JfB/QTkV4iEgXMBpb7OaZgouXnOS1Dz2j5+UhQJHoRWQqsAQaISIaIzDPGVAK3Ax8BO4HXjTHb/RlnoNLy85yWoWe0/PxLJzVTSqkQFxQ1eqWUUk2niV4ppUKcJnqllApxmuiVUirEaaJXSqkQp4leKaVCnCZ6pZQKcZrolVIqxGmiV0qpEPf/AQKMofICqH8oAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.xscale('log');plt.yscale('log');plt.title(\"Frequency of n-th most common word and 1/n line\")\n", + "plt.plot([c for(w,c) in all_character_counts.most_common()])\n", + "M = all_character_counts.most_common()[0][1]\n", + "plt.plot([M/i for i in range(1,len(all_character_counts))])" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def get_probability_from_counts(counts): # fast method\n", + " total_occurences = sum(counts.values())\n", + " def get_prob(char):\n", + " occurence = counts.get(char,0) # D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.\n", + " return occurence/total_occurences\n", + " return get_prob\n", + "\n", + "get_char_prob = get_probability_from_counts(all_character_counts)\n", + "\n", + "def get_char_prob_slow(char): # slow method\n", + " total_occurences = sum(all_character_counts.values())\n", + " return all_character_counts.get(char,0)/total_occurences\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "def get_running_time(func,arg,times):\n", + " start = time.time()\n", + " for _ in range(times):\n", + " func(arg)\n", + " print('elapsed time for {} runs of {} is {} seconds'.format(times,func.__name__,time.time()-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "elapsed time for 10000 runs of get_prob is 0.0059528350830078125 seconds\n" + ] + } + ], + "source": [ + "get_running_time(get_char_prob,'神',10000)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "elapsed time for 10000 runs of get_char_prob_slow is 0.6741960048675537 
seconds\n" + ] + } + ], + "source": [ + "get_running_time(get_char_prob_slow,'神',10000)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import reduce\n", + "from operator import mul\n", + "def prob_of_string(string):\n", + " return reduce(mul,[get_char_prob(char) for char in string])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.8149833542281e-36" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prob_of_string('这是一个比较常见测试用例')" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.6957089481504437e-37" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prob_of_string('这是一个比较罕见测试用例')" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.2745292803369746e-36\n", + "2.0995356460752042e-33\n" + ] + } + ], + "source": [ + "print(prob_of_string('广州有一个地方叫做沥窖'))\n", + "print(prob_of_string('杭州有一个地方叫做西湖'))" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "pair = \"\"\"前天晚上吃晚饭的时候\n", + "前天晚上吃早饭的时候\"\"\".split('\\n')\n", + "\n", + "pair2 = \"\"\"正是一个好看的小猫\n", + "真是一个好看的小猫\"\"\".split('\\n')\n", + "\n", + "pair3 = \"\"\"我无言以对,简直\n", + "我简直无言以对\"\"\".split('\\n')\n", + "\n", + "pairs = [pair, pair2, pair3]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "def get_probability_prefromance(language_model_func, pairs):\n", + " for (p1, p2) in pairs:\n", + " print('*'*18)\n", + " print('\\t\\t {} with probability {}'.format(p1, language_model_func(tokenize(p1)))) # 
tokenize去掉','这样的标点\n", + " print('\\t\\t {} with probability {}'.format(p2, language_model_func(tokenize(p2))))" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 1.2207058723774045e-31\n", + "\t\t 前天晚上吃早饭的时候 with probability 1.420433440421635e-31\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 3.2528612289150613e-25\n", + "\t\t 真是一个好看的小猫 with probability 1.0220793879946632e-25\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 3.7425390630342124e-22\n", + "\t\t 我简直无言以对 with probability 3.742539063034212e-22\n" + ] + } + ], + "source": [ + "get_probability_prefromance(prob_of_string, pairs) # 3个结果都不合理" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2-Gram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$ Pr(w_0w_1w_2...w_n) = Pr(w_0) \\cdot Pr(w_1 | w_0) \\cdot Pr(w_2 | w_1) ... 
\\cdot Pr(w_n | w_{n-1}) $$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$ Pr(w_1 | w_0) = \\frac{Pr(w_1 w_0)}{Pr(w_0)} $$" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "gram_length = 2\n", + "two_gram_counts = Counter([ALL_CHARACTER[i:i+gram_length] for i in range(len(ALL_CHARACTER)-gram_length)])" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('新华', 135490),\n", + " ('华社', 129104),\n", + " ('20', 123427),\n", + " ('nn', 118789),\n", + " ('01', 102583),\n", + " ('17', 81801),\n", + " ('n新', 78433),\n", + " ('中国', 77776),\n", + " ('外代', 74795),\n", + " ('7年', 59051),\n", + " ('记者', 56946),\n", + " ('二线', 55866),\n", + " ('5月', 55491),\n", + " ('代二', 55245),\n", + " ('4月', 51236),\n", + " ('日n', 48360),\n", + " ('月1', 47181),\n", + " ('照片', 46712),\n", + " ('月2', 45268),\n", + " ('社照', 45003),\n", + " ('日在', 39495),\n", + " ('国际', 38923),\n", + " ('发展', 36410),\n", + " ('00', 32399),\n", + " ('国家', 31742),\n", + " ('比赛', 29313),\n", + " ('社记', 27920),\n", + " ('北京', 27639),\n", + " ('美国', 27622),\n", + " ('企业', 27381),\n", + " ('体育', 27169),\n", + " ('赛中', 26877),\n", + " ('公司', 26427),\n", + " ('10', 25595),\n", + " ('工作', 25311),\n", + " ('经济', 25304),\n", + " ('16', 25070),\n", + " ('n当', 25000),\n", + " ('当日', 24291),\n", + " ('20', 24199),\n", + " ('合作', 23932),\n", + " ('举行', 23906),\n", + " ('进行', 23528),\n", + " ('年5', 23416),\n", + " ('n5', 23383),\n", + " ('n4', 23273),\n", + " ('市场', 22666),\n", + " ('足球', 22606),\n", + " ('一个', 22231),\n", + " ('年4', 21904),\n", + " ('世界', 20383),\n", + " ('n外', 20336),\n", + " ('表示', 20215),\n", + " ('1日', 20099),\n", + " ('服务', 20067),\n", + " ('6月', 19602),\n", + " ('政府', 19325),\n", + " ('选手', 18663),\n", + " ('n体', 18616),\n", + " ('建设', 18606),\n", + " ('代2', 18516),\n", + " ('片外', 18512),\n", + " ('联赛', 18165),\n", + " ('摄新', 
17810),\n", + " ('项目', 17809),\n", + " ('我们', 17778),\n", + " ('一路', 17295),\n", + " ('一带', 16930),\n", + " ('社会', 16759),\n", + " ('日电', 16731),\n", + " ('带一', 16683),\n", + " ('技术', 16546),\n", + " ('问题', 16448),\n", + " ('文化', 16386),\n", + " ('通过', 16301),\n", + " ('活动', 16137),\n", + " ('决赛', 16108),\n", + " ('00', 15900),\n", + " ('研究', 15864),\n", + " ('12', 15836),\n", + " ('人民', 15693),\n", + " ('11', 15662),\n", + " ('01', 15505),\n", + " ('15', 15328),\n", + " ('目前', 15296),\n", + " ('5日', 14960),\n", + " ('n1', 14932),\n", + " ('3日', 14872),\n", + " ('全国', 14819),\n", + " ('人员', 14664),\n", + " ('投资', 14531),\n", + " ('2日', 14530),\n", + " ('重要', 14453),\n", + " ('中心', 14365),\n", + " ('产业', 14123),\n", + " ('19', 14112),\n", + " ('今年', 13990),\n", + " ('行的', 13810),\n", + " ('地区', 13807),\n", + " ('法国', 13792),\n", + " ('4日', 13749),\n", + " ('的一', 13732),\n", + " ('城市', 13482),\n", + " ('晋级', 13433),\n", + " ('大学', 13406),\n", + " ('成为', 13392),\n", + " ('9日', 13371),\n", + " ('0日', 13222),\n", + " ('安全', 13176),\n", + " ('6日', 13113),\n", + " ('球员', 13102),\n", + " ('30', 12922),\n", + " ('7日', 12919),\n", + " ('8日', 12891),\n", + " ('没有', 12685),\n", + " ('创新', 12627),\n", + " ('管理', 12621),\n", + " ('自己', 12592),\n", + " ('n2', 12575),\n", + " ('组织', 12481),\n", + " ('利亚', 12406),\n", + " ('13', 12396),\n", + " ('他们', 12384),\n", + " ('方面', 12243),\n", + " ('第一', 12231),\n", + " ('时间', 12157),\n", + " ('14', 11928),\n", + " ('可以', 11898),\n", + " ('月3', 11878),\n", + " ('电记', 11810),\n", + " ('这是', 11801),\n", + " ('新闻', 11645),\n", + " ('摄n', 11579),\n", + " ('18', 11553),\n", + " ('10', 11467),\n", + " ('数据', 11404),\n", + " ('上海', 11281),\n", + " ('媒体', 11080),\n", + " ('提供', 11046),\n", + " ('战胜', 11014),\n", + " ('罗斯', 11002),\n", + " ('产品', 10929),\n", + " ('科技', 10926),\n", + " ('学生', 10902),\n", + " ('已经', 10854),\n", + " ('发布', 10817),\n", + " ('总统', 10788),\n", + " ('同时', 10754),\n", + " ('相关', 10727),\n", + " ('认为', 10627),\n", + " 
('实现', 10625),\n", + " ('全球', 10570),\n", + " ('队球', 10498),\n", + " ('俄罗', 10380),\n", + " ('信息', 10324),\n", + " ('部门', 10301),\n", + " ('政策', 10252),\n", + " ('香港', 10161),\n", + " ('环境', 10133),\n", + " ('联合', 10123),\n", + " ('改革', 10095),\n", + " ('线足', 10073),\n", + " ('开始', 10069),\n", + " ('英国', 10051),\n", + " ('增长', 10003),\n", + " ('n国', 9939),\n", + " ('公开', 9910),\n", + " ('6年', 9898),\n", + " ('网球', 9876),\n", + " ('可能', 9800),\n", + " ('赛季', 9782),\n", + " ('平台', 9763),\n", + " ('报道', 9727),\n", + " ('作为', 9679),\n", + " ('在比', 9666),\n", + " ('赛n', 9574),\n", + " ('其中', 9548),\n", + " ('银行', 9538),\n", + " ('金融', 9523),\n", + " ('机构', 9346),\n", + " ('参加', 9336),\n", + " ('23', 9327),\n", + " ('支持', 9327),\n", + " ('大利', 9315),\n", + " ('25', 9312),\n", + " ('完新', 9295),\n", + " ('冠军', 9270),\n", + " ('关系', 9260),\n", + " ('教育', 9228),\n", + " ('生活', 9207),\n", + " ('21', 9187),\n", + " ('第二', 9158),\n", + " ('以及', 9147),\n", + " ('是一', 9139),\n", + " ('在2', 9127),\n", + " ('德国', 8979),\n", + " ('旅游', 8965),\n", + " ('获得', 8956),\n", + " ('时装', 8948),\n", + " ('亿元', 8923),\n", + " ('情况', 8913),\n", + " ('开展', 8895),\n", + " ('主要', 8871),\n", + " ('50', 8844),\n", + " ('当地', 8839),\n", + " ('发生', 8802),\n", + " ('日本', 8800),\n", + " ('推进', 8791),\n", + " ('影响', 8739),\n", + " ('生产', 8724),\n", + " ('部分', 8710),\n", + " ('22', 8700),\n", + " ('设计', 8681),\n", + " ('需要', 8623),\n", + " ('一步', 8608),\n", + " ('这一', 8586),\n", + " ('系统', 8582),\n", + " ('n这', 8511),\n", + " ('要求', 8389),\n", + " ('一些', 8372),\n", + " ('我国', 8357),\n", + " ('了一', 8259),\n", + " ('介绍', 8252),\n", + " ('显示', 8227),\n", + " ('进入', 8197),\n", + " ('领域', 8084),\n", + " ('美元', 8060),\n", + " ('新n', 8018),\n", + " ('推动', 8002),\n", + " ('以来', 7949),\n", + " ('电影', 7928),\n", + " ('包括', 7918),\n", + " ('5月', 7916),\n", + " ('这些', 7915),\n", + " ('国内', 7913),\n", + " ('发现', 7890),\n", + " ('62', 7886),\n", + " ('社发', 7857),\n", + " ('4月', 7783),\n", + " ('这个', 7775),\n", + " 
('交易', 7765),\n", + " ('保护', 7757),\n", + " ('共同', 7754),\n", + " ('的2', 7753),\n", + " ('使用', 7738),\n", + " ('计划', 7733),\n", + " ('代表', 7703),\n", + " ('一名', 7685),\n", + " ('工程', 7652),\n", + " ('传统', 7639),\n", + " ('之一', 7633),\n", + " ('未来', 7593),\n", + " ('基础', 7591),\n", + " ('年来', 7583),\n", + " ('参与', 7575),\n", + " ('实施', 7568),\n", + " ('n在', 7524),\n", + " ('就是', 7469),\n", + " ('调查', 7446),\n", + " ('24', 7445),\n", + " ('孩子', 7397),\n", + " ('开赛', 7395),\n", + " ('行业', 7341),\n", + " ('上的', 7320),\n", + " ('集团', 7298),\n", + " ('来自', 7296),\n", + " ('资金', 7289),\n", + " ('历史', 7278),\n", + " ('进一', 7264),\n", + " ('年的', 7253),\n", + " ('中央', 7244),\n", + " ('加强', 7242),\n", + " ('科学', 7239),\n", + " ('16', 7193),\n", + " ('28', 7173),\n", + " ('大的', 7171),\n", + " ('队n', 7145),\n", + " ('7赛', 7121),\n", + " ('31', 7115),\n", + " ('了解', 7101),\n", + " ('现在', 7087),\n", + " ('希望', 7085),\n", + " ('出现', 7070),\n", + " ('17', 7024),\n", + " ('方式', 7022),\n", + " ('资源', 7018),\n", + " ('的重', 7007),\n", + " ('26', 6926),\n", + " ('正在', 6906),\n", + " ('负责', 6840),\n", + " ('价格', 6823),\n", + " ('第三', 6804),\n", + " ('运动', 6767),\n", + " ('的中', 6759),\n", + " ('韩国', 6736),\n", + " ('举办', 6733),\n", + " ('个人', 6730),\n", + " ('不断', 6725),\n", + " ('国的', 6721),\n", + " ('n3', 6719),\n", + " ('积极', 6711),\n", + " ('27', 6689),\n", + " ('的人', 6671),\n", + " ('完成', 6661),\n", + " ('巴黎', 6633),\n", + " ('网络', 6633),\n", + " ('持续', 6613),\n", + " ('n6', 6560),\n", + " ('路透', 6533),\n", + " ('赛第', 6484),\n", + " ('年6', 6481),\n", + " ('中的', 6457),\n", + " ('超过', 6453),\n", + " ('月1', 6440),\n", + " ('0多', 6432),\n", + " ('继续', 6429),\n", + " ('也是', 6373),\n", + " ('的是', 6371),\n", + " ('特朗', 6349),\n", + " ('法新', 6339),\n", + " ('交流', 6323),\n", + " ('更多', 6319),\n", + " ('去年', 6315),\n", + " ('朗普', 6315),\n", + " ('月2', 6302),\n", + " ('学院', 6274),\n", + " ('学校', 6268),\n", + " ('会议', 6263),\n", + " ('12', 6255),\n", + " ('庆祝', 6241),\n", + " ('能力', 6236),\n", + 
" ('其他', 6235),\n", + " ('斯坦', 6230),\n", + " ('来的', 6216),\n", + " ('根据', 6200),\n", + " ('万元', 6172),\n", + " ('人的', 6161),\n", + " ('欧新', 6159),\n", + " ('标准', 6157),\n", + " ('艺术', 6156),\n", + " ('欧洲', 6152),\n", + " ('到了', 6148),\n", + " ('贸易', 6135),\n", + " ('社欧', 6129),\n", + " ('全面', 6129),\n", + " ('制造', 6122),\n", + " ('提高', 6118),\n", + " ('提升', 6100),\n", + " ('70', 6084),\n", + " ('主席', 6077),\n", + " ('30', 6065),\n", + " ('一次', 6046),\n", + " ('委员', 6033),\n", + " ('汽车', 6018),\n", + " ('3月', 6001),\n", + " ('11', 5998),\n", + " ('对于', 5989),\n", + " ('如果', 5989),\n", + " ('n一', 5965),\n", + " ('生态', 5962),\n", + " ('现场', 5961),\n", + " ('开发', 5960),\n", + " ('15', 5946),\n", + " ('主场', 5943),\n", + " ('n据', 5941),\n", + " ('有关', 5940),\n", + " ('的新', 5928),\n", + " ('战略', 5892),\n", + " ('专家', 5878),\n", + " ('首都', 5850),\n", + " ('有限', 5842),\n", + " ('29', 5842),\n", + " ('业的', 5831),\n", + " ('江苏', 5821),\n", + " ('因为', 5812),\n", + " ('台湾', 5780),\n", + " ('场以', 5773),\n", + " ('专业', 5770),\n", + " ('最大', 5768),\n", + " ('40', 5749),\n", + " ('河北', 5745),\n", + " ('成功', 5721),\n", + " ('n中', 5719),\n", + " ('开放', 5702),\n", + " ('19', 5692),\n", + " ('增加', 5690),\n", + " ('展示', 5671),\n", + " ('重点', 5668),\n", + " ('队主', 5662),\n", + " ('建立', 5654),\n", + " ('人们', 5649),\n", + " ('智能', 5648),\n", + " ('提出', 5647),\n", + " ('国人', 5594),\n", + " ('规模', 5591),\n", + " ('比0', 5577),\n", + " ('交通', 5576),\n", + " ('群众', 5575),\n", + " ('民币', 5575),\n", + " ('正式', 5573),\n", + " ('社法', 5557),\n", + " ('报告', 5554),\n", + " ('BA', 5552),\n", + " ('工业', 5552),\n", + " ('的时', 5529),\n", + " ('行为', 5516),\n", + " ('社北', 5509),\n", + " ('新的', 5500),\n", + " ('领导', 5477),\n", + " ('0万', 5471),\n", + " ('健康', 5468),\n", + " ('消费', 5467),\n", + " ('品牌', 5461),\n", + " ('的大', 5449),\n", + " ('会上', 5442),\n", + " ('水平', 5432),\n", + " ('行动', 5429),\n", + " ('促进', 5413),\n", + " ('5年', 5408),\n", + " ('造成', 5399),\n", + " ('接受', 5377),\n", + " ('论坛', 5374),\n", 
+ " ('澳大', 5371),\n", + " ('基金', 5371),\n", + " ('事件', 5366),\n", + " ('不同', 5361),\n", + " ('农业', 5347),\n", + " ('很多', 5336),\n", + " ('学习', 5313),\n", + " ('0年', 5311),\n", + " ('两国', 5305),\n", + " ('国选', 5304),\n", + " ('互联', 5302),\n", + " ('们的', 5301),\n", + " ('利用', 5295),\n", + " ('线网', 5292),\n", + " ('游客', 5291),\n", + " ('尼亚', 5290),\n", + " ('风险', 5281),\n", + " ('公里', 5281),\n", + " ('说n', 5269),\n", + " ('铁路', 5252),\n", + " ('轮比', 5245),\n", + " ('系列', 5244),\n", + " ('业务', 5239),\n", + " ('规定', 5234),\n", + " ('在一', 5231),\n", + " ('这样', 5229),\n", + " ('广州', 5224),\n", + " ('责任', 5208),\n", + " ('的发', 5189),\n", + " ('区的', 5187),\n", + " ('广东', 5182),\n", + " ('解决', 5182),\n", + " ('1外', 5173),\n", + " ('政治', 5158),\n", + " ('篮球', 5154),\n", + " ('当天', 5141),\n", + " ('双方', 5117),\n", + " ('监管', 5116),\n", + " ('关注', 5100),\n", + " ('比1', 5094),\n", + " ('社路', 5093),\n", + " ('男子', 5085),\n", + " ('带来', 5075),\n", + " ('戛纳', 5065),\n", + " ('作品', 5058),\n", + " ('习近', 5055),\n", + " ('近平', 5054),\n", + " ('有一', 5049),\n", + " ('地方', 5043),\n", + " ('随着', 5039),\n", + " ('综合', 5030),\n", + " ('医院', 5022),\n", + " ('标题', 5007),\n", + " ('2外', 4981),\n", + " ('60', 4960),\n", + " ('医疗', 4955),\n", + " ('用户', 4954),\n", + " ('内容', 4952),\n", + " ('个月', 4939),\n", + " ('不仅', 4937),\n", + " ('行n', 4933),\n", + " ('开幕', 4928),\n", + " ('设施', 4919),\n", + " ('月5', 4917),\n", + " ('形成', 4912),\n", + " ('规划', 4909),\n", + " ('区域', 4909),\n", + " ('己的', 4904),\n", + " ('首次', 4904),\n", + " ('分别', 4901),\n", + " ('拍摄', 4900),\n", + " ('非常', 4885),\n", + " ('精神', 4877),\n", + " ('达到', 4860),\n", + " ('人士', 4857),\n", + " ('贫困', 4851),\n", + " ('位于', 4850),\n", + " ('存在', 4846),\n", + " ('由于', 4831),\n", + " ('需求', 4820),\n", + " ('分析', 4803),\n", + " ('2比', 4801),\n", + " ('期间', 4780),\n", + " ('机制', 4775),\n", + " ('过程', 4775),\n", + " ('1日', 4770),\n", + " ('以上', 4748),\n", + " ('制度', 4739),\n", + " ('儿童', 4732),\n", + " ('在法', 4724),\n", + " ('n图', 
4715),\n", + " ('取得', 4713),\n", + " ('球n', 4711),\n", + " ('基本', 4704),\n", + " ('新疆', 4703),\n", + " ('不是', 4700),\n", + " ('塞尔', 4681),\n", + " ('帮助', 4673),\n", + " ('3外', 4670),\n", + " ('特别', 4664),\n", + " ('完n', 4661),\n", + " ('创业', 4646),\n", + " ('什么', 4641),\n", + " ('展的', 4641),\n", + " ('在这', 4639),\n", + " ('国国', 4636),\n", + " ('一起', 4635),\n", + " ('限公', 4634),\n", + " ('决定', 4631),\n", + " ('的第', 4624),\n", + " ('山东', 4602),\n", + " ('保障', 4602),\n", + " ('目标', 4580),\n", + " ('天津', 4579),\n", + " ('最高', 4576),\n", + " ('联网', 4574),\n", + " ('指出', 4570),\n", + " ('基地', 4568),\n", + " ('宣布', 4567),\n", + " ('员会', 4552),\n", + " ('NB', 4552),\n", + " ('年1', 4546),\n", + " ('青年', 4544),\n", + " ('为了', 4542),\n", + " ('里的', 4542),\n", + " ('还是', 4541),\n", + " ('强n', 4538),\n", + " ('训练', 4538),\n", + " ('意大', 4527),\n", + " ('处理', 4518),\n", + " ('单位', 4512),\n", + " ('作用', 4510),\n", + " ('人才', 4509),\n", + " ('此次', 4502),\n", + " ('收入', 4499),\n", + " ('优势', 4495),\n", + " ('的国', 4493),\n", + " ('好的', 4474),\n", + " ('月4', 4452),\n", + " ('空间', 4442),\n", + " ('打造', 4441),\n", + " ('书记', 4433),\n", + " ('浙江', 4430),\n", + " ('一直', 4428),\n", + " ('所有', 4421),\n", + " ('消息', 4419),\n", + " ('最终', 4411),\n", + " ('装周', 4408),\n", + " ('关键', 4397),\n", + " ('模式', 4390),\n", + " ('0战', 4387),\n", + " ('体系', 4364),\n", + " ('80', 4353),\n", + " ('作的', 4349),\n", + " ('还有', 4346),\n", + " ('成立', 4345),\n", + " ('如何', 4345),\n", + " ('经营', 4337),\n", + " ('这种', 4332),\n", + " ('50', 4331),\n", + " ('以2', 4328),\n", + " ('印度', 4328),\n", + " ('电视', 4313),\n", + " ('4外', 4301),\n", + " ('出了', 4295),\n", + " ('销售', 4295),\n", + " ('图表', 4285),\n", + " ('之后', 4283),\n", + " ('近年', 4275),\n", + " ('在中', 4271),\n", + " ('小时', 4266),\n", + " ('京2', 4259),\n", + " ('吸引', 4257),\n", + " ('标赛', 4253),\n", + " ('在北', 4247),\n", + " ('动物', 4231),\n", + " ('女子', 4226),\n", + " ('任务', 4218),\n", + " ('锦标', 4215),\n", + " ('6年', 4210),\n", + " ('具有', 4208),\n", + " 
('西班', 4205),\n", + " ('月6', 4203),\n", + " ('武汉', 4198),\n", + " ('班牙', 4195),\n", + " ('重大', 4194),\n", + " ('之路', 4185),\n", + " ('协议', 4183),\n", + " ('启动', 4181),\n", + " ('多的', 4168),\n", + " ('按照', 4155),\n", + " ('02', 4155),\n", + " ('影片', 4150),\n", + " ('有效', 4150),\n", + " ('13', 4147),\n", + " ('样的', 4146),\n", + " ('袭击', 4140),\n", + " ('超联', 4139),\n", + " ('一年', 4135),\n", + " ('纽约', 4133),\n", + " ('受到', 4127),\n", + " ('航空', 4126),\n", + " ('英超', 4126),\n", + " ('n小', 4123),\n", + " ('死亡', 4113),\n", + " ('能够', 4111),\n", + " ('指数', 4105),\n", + " ('关于', 4103),\n", + " ('选择', 4102),\n", + " ('不少', 4101),\n", + " ('出的', 4101),\n", + " ('不能', 4094),\n", + " ('得到', 4085),\n", + " ('德里', 4078),\n", + " ('军n', 4073),\n", + " ('文明', 4073),\n", + " ('共享', 4065),\n", + " ('月9', 4056),\n", + " ('扶贫', 4051),\n", + " ('近日', 4048),\n", + " ('国队', 4042),\n", + " ('最后', 4040),\n", + " ('都是', 4037),\n", + " ('美联', 4029),\n", + " ('能源', 4029),\n", + " ('成果', 4023),\n", + " ('表演', 4022),\n", + " ('14', 4021),\n", + " ('稳定', 4016),\n", + " ('选举', 4011),\n", + " ('斯特', 4003),\n", + " ('主题', 4002),\n", + " ('伊斯', 3995),\n", + " ('合国', 3989),\n", + " ('的主', 3989),\n", + " ('博物', 3982),\n", + " ('球法', 3982),\n", + " ('措施', 3970),\n", + " ('动n', 3967),\n", + " ('统计', 3963),\n", + " ('坚持', 3962),\n", + " ('一家', 3959),\n", + " ('看到', 3957),\n", + " ('夺冠', 3950),\n", + " ('主任', 3950),\n", + " ('预计', 3950),\n", + " ('教练', 3946),\n", + " ('将于', 3940),\n", + " ('在美', 3940),\n", + " ('下一', 3928),\n", + " ('岁的', 3928),\n", + " ('日新', 3927),\n", + " ('家庭', 3919),\n", + " ('成了', 3919),\n", + " ('上涨', 3910),\n", + " ('叙利', 3902),\n", + " ('乒乓', 3901),\n", + " ('摄的', 3897),\n", + " ('将在', 3890),\n", + " ('同比', 3888),\n", + " ('日报', 3874),\n", + " ('乓球', 3873),\n", + " ('的生', 3873),\n", + " ('落实', 3869),\n", + " ('阶段', 3864),\n", + " ('结果', 3859),\n", + " ('5外', 3856),\n", + " ('努力', 3855),\n", + " ('警方', 3853),\n", + " ('条件', 3853),\n", + " ('多个', 3838),\n", + " ('此外', 3827),\n", + 
" ('直接', 3826),\n", + " ('的工', 3826),\n", + " ('球英', 3826),\n", + " ('团队', 3822),\n", + " ('法律', 3820),\n", + " ('分钟', 3818),\n", + " ('是在', 3814),\n", + " ('全部', 3813),\n", + " ('变化', 3812),\n", + " ('建筑', 3808),\n", + " ('结构', 3804),\n", + " ('18', 3804),\n", + " ('公布', 3804),\n", + " ('责人', 3803),\n", + " ('编辑', 3801),\n", + " ('不过', 3800),\n", + " ('化的', 3795),\n", + " ('告诉', 3787),\n", + " ('虽然', 3780),\n", + " ('两个', 3778),\n", + " ('月7', 3777),\n", + " ('重庆', 3775),\n", + " ('5日', 3773),\n", + " ('时候', 3767),\n", + " ('7年', 3762),\n", + " ('发挥', 3762),\n", + " ('小标', 3762),\n", + " ('方案', 3761),\n", + " ('保持', 3758),\n", + " ('严重', 3756),\n", + " ('的成', 3755),\n", + " ('出席', 3755),\n", + " ('物馆', 3740),\n", + " ('河南', 3739),\n", + " ('播发', 3735),\n", + " ('经过', 3734),\n", + " ('执行', 3733),\n", + " ('目的', 3730),\n", + " ('毕业', 3719),\n", + " ('朝鲜', 3717),\n", + " ('更加', 3709),\n", + " ('中n', 3700),\n", + " ('日中', 3695),\n", + " ('结束', 3695),\n", + " ('这里', 3693),\n", + " ('核心', 3687),\n", + " ('伦敦', 3678),\n", + " ('的地', 3673),\n", + " ('明显', 3673),\n", + " ('成绩', 3672),\n", + " ('是中', 3672),\n", + " ('球队', 3663),\n", + " ('国务', 3660),\n", + " ('质量', 3659),\n", + " ('锦赛', 3657),\n", + " ('以1', 3654),\n", + " ('为中', 3652),\n", + " ('有的', 3647),\n", + " ('25', 3646),\n", + " ('各地', 3645),\n", + " ('调整', 3644),\n", + " ('万人', 3643),\n", + " ('他的', 3642),\n", + " ('倡议', 3639),\n", + " ('斯科', 3638),\n", + " ('地产', 3637),\n", + " ('法院', 3633),\n", + " ('村民', 3632),\n", + " ('针对', 3632),\n", + " ('行了', 3611),\n", + " ('环保', 3608),\n", + " ('行政', 3608),\n", + " ('之间', 3602),\n", + " ('意见', 3597),\n", + " ('半决', 3592),\n", + " ('卫星', 3590),\n", + " ('新新', 3579),\n", + " ('大会', 3579),\n", + " ('展n', 3577),\n", + " ('内的', 3577),\n", + " ('1战', 3569),\n", + " ('后的', 3563),\n", + " ('生的', 3562),\n", + " ('影节', 3562),\n", + " ('现了', 3557),\n", + " ('家的', 3554),\n", + " ('面积', 3553),\n", + " ('特色', 3551),\n", + " ('高峰', 3550),\n", + " ('小学', 3547),\n", + " ('成本', 3543),\n", 
+ " ('来越', 3543),\n", + " ('自然', 3540),\n", + " ('一种', 3539),\n", + " ('成员', 3537),\n", + " ('n2', 3532),\n", + " ('长期', 3531),\n", + " ('干部', 3531),\n", + " ('秀n', 3531),\n", + " ('时代', 3530),\n", + " ('脱贫', 3526),\n", + " ('仪式', 3524),\n", + " ('协会', 3516),\n", + " ('日摄', 3509),\n", + " ('过去', 3499),\n", + " ('价值', 3491),\n", + " ('导致', 3487),\n", + " ('一场', 3485),\n", + " ('居民', 3485),\n", + " ('截至', 3481),\n", + " ('音乐', 3481),\n", + " ('马德', 3475),\n", + " ('运会', 3473),\n", + " ('的情', 3471),\n", + " ('要的', 3469),\n", + " ('机会', 3469),\n", + " ('知识', 3465),\n", + " ('越来', 3463),\n", + " ('主义', 3459),\n", + " ('3日', 3456),\n", + " ('布会', 3455),\n", + " ('欧盟', 3450),\n", + " ('公安', 3449),\n", + " ('卫生', 3441),\n", + " ('职业', 3435),\n", + " ('外交', 3429),\n", + " ('准备', 3424),\n", + " ('场的', 3422),\n", + " ('范围', 3422),\n", + " ('6外', 3422),\n", + " ('不会', 3420),\n", + " ('但是', 3418),\n", + " ('南省', 3407),\n", + " ('甚至', 3404),\n", + " ('一定', 3402),\n", + " ('因此', 3400),\n", + " ('湖北', 3398),\n", + " ('民族', 3393),\n", + " ('升级', 3381),\n", + " ('级n', 3379),\n", + " ('节n', 3378),\n", + " ('应用', 3378),\n", + " ('控制', 3377),\n", + " ('甲联', 3372),\n", + " ('垃圾', 3372),\n", + " ('为主', 3369),\n", + " ('法网', 3367),\n", + " ('6日', 3365),\n", + " ('3比', 3362),\n", + " ('国总', 3362),\n", + " ('季度', 3356),\n", + " ('原因', 3356),\n", + " ('实际', 3356),\n", + " ('明确', 3355),\n", + " ('商品', 3353),\n", + " ('马拉', 3351),\n", + " ('公园', 3349),\n", + " ('每年', 3349),\n", + " ('网站', 3347),\n", + " ('克斯', 3346),\n", + " ('的最', 3338),\n", + " ('自由', 3335),\n", + " ('冠n', 3334),\n", + " ('2日', 3333),\n", + " ('犯罪', 3331),\n", + " ('市民', 3330),\n", + " ('部长', 3330),\n", + " ('巴西', 3329),\n", + " ('者李', 3327),\n", + " ('完善', 3326),\n", + " ('手机', 3325),\n", + " ('赛男', 3324),\n", + " ('羽毛', 3323),\n", + " ('和平', 3321),\n", + " ('日前', 3313),\n", + " ('人在', 3312),\n", + " ('培训', 3303),\n", + " ('毛球', 3291),\n", + " ('无人', 3288),\n", + " ('必须', 3287),\n", + " ('月8', 3284),\n", + " ('中华', 
3281),\n", + " ('机器', 3279),\n", + " ('就业', 3279),\n", + " ('设备', 3276),\n", + " ('如今', 3265),\n", + " ('育1', 3257),\n", + " ('西亚', 3254),\n", + " ('体验', 3253),\n", + " ('的高', 3250),\n", + " ('布的', 3249),\n", + " ('加快', 3248),\n", + " ('0日', 3246),\n", + " ('多年', 3246),\n", + " ('并不', 3240),\n", + " ('附近', 3238),\n", + " ('功能', 3237),\n", + " ('21', 3236),\n", + " ('公共', 3230),\n", + " ('州市', 3226),\n", + " ('航天', 3221),\n", + " ('分之', 3218),\n", + " ('运营', 3210),\n", + " ('和国', 3210),\n", + " ('资产', 3207),\n", + " ('幕n', 3203),\n", + " ('4日', 3202),\n", + " ('地的', 3201),\n", + " ('挑战', 3184),\n", + " ('土耳', 3179),\n", + " ('耳其', 3179),\n", + " ('飞机', 3178),\n", + " ('35', 3176),\n", + " ('集中', 3175),\n", + " ('社区', 3174),\n", + " ('欧冠', 3169),\n", + " ('定的', 3165),\n", + " ('农民', 3164),\n", + " ('亚洲', 3163),\n", + " ('们在', 3162),\n", + " ('式n', 3161),\n", + " ('05', 3160),\n", + " ('图片', 3158),\n", + " ('现代', 3155),\n", + " ('程中', 3152),\n", + " ('4年', 3152),\n", + " ('新区', 3151),\n", + " ('突破', 3149),\n", + " ('0多', 3148),\n", + " ('n社', 3144),\n", + " ('亿美', 3140),\n", + " ('除了', 3127),\n", + " ('1月', 3126),\n", + " ('只有', 3126),\n", + " ('中中', 3124),\n", + " ('者王', 3117),\n", + " ('会在', 3116),\n", + " ('莫斯', 3116),\n", + " ('为一', 3114),\n", + " ('竞争', 3113),\n", + " ('贵州', 3107),\n", + " ('产生', 3104),\n", + " ('总理', 3102),\n", + " ('深入', 3101),\n", + " ('深圳', 3101),\n", + " ('委会', 3101),\n", + " ('发表', 3097),\n", + " ('案件', 3097),\n", + " ('声明', 3095),\n", + " ('商业', 3094),\n", + " ('中超', 3090),\n", + " ('大家', 3087),\n", + " ('力量', 3080),\n", + " ('沿线', 3079),\n", + " ('左右', 3077),\n", + " ('99', 3074),\n", + " ('平方', 3073),\n", + " ('7日', 3072),\n", + " ('摄影', 3067),\n", + " ('年前', 3066),\n", + " ('的比', 3063),\n", + " ('人n', 3060),\n", + " ('保险', 3051),\n", + " ('两岸', 3050),\n", + " ('第十', 3044),\n", + " ('以3', 3043),\n", + " ('球欧', 3040),\n", + " ('展开', 3038),\n", + " ('第3', 3032),\n", + " ('采访', 3032),\n", + " ('办公', 3031),\n", + " ('36', 3029),\n", + " 
('0亿', 3023),\n", + " ('广西', 3022),\n", + " ('农村', 3021),\n", + " ('观众', 3017),\n", + " ('电子', 3005),\n", + " ('连续', 3005),\n", + " ('研发', 3005),\n", + " ('电话', 3004),\n", + " ('机关', 3002),\n", + " ('起来', 2998),\n", + " ('片北', 2994),\n", + " ('自治', 2991),\n", + " ('四川', 2988),\n", + " ('一位', 2983),\n", + " ('融资', 2982),\n", + " ('本次', 2975),\n", + " ('作人', 2975),\n", + " ('的基', 2973),\n", + " ('罗马', 2970),\n", + " ('7外', 2964),\n", + " ('另一', 2963),\n", + " ('结合', 2961),\n", + " ('面的', 2960),\n", + " ('他说', 2959),\n", + " ('视频', 2959),\n", + " ('治理', 2957),\n", + " ('个国', 2956),\n", + " ('此前', 2953),\n", + " ('23', 2952),\n", + " ('下降', 2951),\n", + " ('检查', 2951),\n", + " ('22', 2948),\n", + " ('支付', 2948),\n", + " ('食品', 2947),\n", + " ...]" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "two_gram_counts.most_common()[0:100]" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [], + "source": [ + "get_pair_prob = get_probability_from_counts(two_gram_counts)\n", + "\n", + "def get_2_gram_prob(word,prev): # probability of seeing \"word\" given \"prev\"\n", + " if get_pair_prob(prev+word) > 0:\n", + " return get_pair_prob(prev+word)/get_char_prob(prev)\n", + " else:\n", + " return get_char_prob(word)\n", + "\n", + "def get_2_gram_string_prob(string):\n", + " probList = []\n", + " for i,c in enumerate(string):\n", + " prev = '' if i == 0 else string[i-1]\n", + " probList.append(get_2_gram_prob(c,prev))\n", + " return reduce(mul,probList)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 1.2207058723774045e-31\n", + "\t\t 前天晚上吃早饭的时候 with probability 1.420433440421635e-31\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 3.2528612289150613e-25\n", + "\t\t 真是一个好看的小猫 with probability 
1.0220793879946632e-25\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 3.7425390630342124e-22\n", + "\t\t 我简直无言以对 with probability 3.742539063034212e-22\n" + ] + } + ], + "source": [ + "get_probability_prefromance(prob_of_string, pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 6.847690505341362e-20\n", + "\t\t 前天晚上吃早饭的时候 with probability 1.7483929208056836e-19\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 1.313877986865493e-16\n", + "\t\t 真是一个好看的小猫 with probability 8.984863857283642e-17\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 2.143887688284483e-17\n", + "\t\t 我简直无言以对 with probability 2.0730979185519055e-17\n" + ] + } + ], + "source": [ + "get_probability_prefromance(get_2_gram_string_prob, pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.057921115909864e-15" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.6195189821101717e-12" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [], 
+ "source": [] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "380434793" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('的', 9938192),\n", + " ('1', 5740539),\n", + " ('0', 4559519),\n", + " ('年', 4088849),\n", + " ('2', 3705103),\n", + " ('一', 3174566),\n", + " ('在', 3142422),\n", + " ('是', 2800422),\n", + " ('中', 2763222),\n", + " ('9', 2730241),\n", + " ('人', 2610319),\n", + " ('大', 2095073),\n", + " ('有', 2064509),\n", + " ('e', 1885083),\n", + " ('a', 1789303),\n", + " ('3', 1753587),\n", + " ('5', 1721315),\n", + " ('和', 1705550),\n", + " ('為', 1662714),\n", + " ('8', 1646008),\n", + " ('以', 1611294),\n", + " ('4', 1579101),\n", + " ('月', 1527145),\n", + " ('日', 1511531),\n", + " ('6', 1501304),\n", + " ('o', 1452120),\n", + " ('7', 1440467),\n", + " ('了', 1434073),\n", + " ('国', 1417423),\n", + " ('i', 1396361),\n", + " ('为', 1360856),\n", + " ('n', 1348198),\n", + " ('r', 1329195),\n", + " ('地', 1309342),\n", + " ('於', 1292718),\n", + " ('國', 1290442),\n", + " ('成', 1267631),\n", + " ('公', 1228434),\n", + " ('上', 1220226),\n", + " ('不', 1198193),\n", + " ('出', 1152193),\n", + " ('行', 1134022),\n", + " ('他', 1128358),\n", + " ('部', 1117122),\n", + " ('斯', 1107951),\n", + " ('t', 1088959),\n", + " ('之', 1060971),\n", + " ('名', 1027713),\n", + " ('其', 994654),\n", + " ('主', 993612),\n", + " ('作', 977896),\n", + " ('生', 967962),\n", + " ('l', 964757),\n", + " ('分', 953857),\n", + " ('方', 948753),\n", + " ('于', 929041),\n", + " ('s', 920003),\n", + " ('下', 915048),\n", + " ('西', 912137),\n", + " ('用', 904531),\n", + " ('到', 904395),\n", + " ('第', 898354),\n", + " ('家', 865004),\n", + " 
('位', 862875),\n", + " ('而', 862821),\n", + " ('及', 857057),\n", + " ('由', 856546),\n", + " ('後', 834189),\n", + " ('時', 830986),\n", + " ('被', 817636),\n", + " ('本', 810581),\n", + " ('民', 808941),\n", + " ('法', 804770),\n", + " ('南', 804650),\n", + " ('学', 798939),\n", + " ('市', 791625),\n", + " ('里', 778211),\n", + " ('會', 777237),\n", + " ('同', 771726),\n", + " ('个', 763306),\n", + " ('多', 761161),\n", + " ('可', 760766),\n", + " ('政', 753960),\n", + " ('子', 747029),\n", + " ('自', 721717),\n", + " ('任', 719743),\n", + " ('特', 708178),\n", + " ('北', 705727),\n", + " ('文', 703854),\n", + " ('高', 684065),\n", + " ('加', 682447),\n", + " ('德', 679369),\n", + " ('海', 676572),\n", + " ('最', 676398),\n", + " ('新', 673906),\n", + " ('與', 670787),\n", + " ('克', 665932),\n", + " ('u', 665502),\n", + " ('后', 658911),\n", + " ('也', 657383),\n", + " ('代', 657013),\n", + " ('前', 656735),\n", + " ('等', 654057),\n", + " ('利', 651952),\n", + " ('平', 650907),\n", + " ('所', 646943),\n", + " ('學', 637814),\n", + " ('三', 637252),\n", + " ('h', 633489),\n", + " ('道', 627641),\n", + " ('度', 625007),\n", + " ('個', 624395),\n", + " ('因', 622385),\n", + " ('此', 620558),\n", + " ('面', 620391),\n", + " ('建', 618519),\n", + " ('得', 617685),\n", + " ('美', 610494),\n", + " ('物', 610144),\n", + " ('要', 602652),\n", + " ('教', 587490),\n", + " ('山', 585261),\n", + " ('时', 578547),\n", + " ('至', 575987),\n", + " ('事', 574421),\n", + " ('小', 573926),\n", + " ('路', 571932),\n", + " ('立', 571585),\n", + " ('拉', 568622),\n", + " ('口', 563811),\n", + " ('c', 561702),\n", + " ('会', 560024),\n", + " ('m', 559579),\n", + " ('尔', 559186),\n", + " ('入', 558897),\n", + " ('能', 554237),\n", + " ('二', 550386),\n", + " ('世', 550038),\n", + " ('發', 542170),\n", + " ('理', 539290),\n", + " ('d', 536237),\n", + " ('天', 532117),\n", + " ('工', 532015),\n", + " ('外', 530203),\n", + " ('合', 529451),\n", + " ('区', 528732),\n", + " ('表', 527233),\n", + " ('州', 526788),\n", + " ('S', 526181),\n", + " ('十', 523368),\n", + 
" ('定', 523324),\n", + " ('球', 520041),\n", + " ('A', 518904),\n", + " ('但', 516365),\n", + " ('期', 515809),\n", + " ('區', 514889),\n", + " ('化', 514021),\n", + " ('台', 512971),\n", + " ('科', 507284),\n", + " ('使', 506784),\n", + " ('全', 502586),\n", + " ('次', 502296),\n", + " ('性', 489355),\n", + " ('通', 484795),\n", + " ('者', 477756),\n", + " ('共', 469358),\n", + " ('C', 466320),\n", + " ('式', 463738),\n", + " ('重', 460538),\n", + " ('都', 458267),\n", + " ('並', 447402),\n", + " ('力', 446857),\n", + " ('來', 444149),\n", + " ('发', 443728),\n", + " ('安', 442999),\n", + " ('士', 442969),\n", + " ('布', 440096),\n", + " ('軍', 433577),\n", + " ('军', 425462),\n", + " ('比', 423014),\n", + " ('王', 420613),\n", + " ('城', 419587),\n", + " ('基', 418242),\n", + " ('原', 418019),\n", + " ('相', 417754),\n", + " ('院', 416280),\n", + " ('站', 414110),\n", + " ('省', 412008),\n", + " ('明', 410219),\n", + " ('動', 406572),\n", + " ('目', 406002),\n", + " ('爾', 405283),\n", + " ('長', 403563),\n", + " ('这', 401324),\n", + " ('水', 400762),\n", + " ('司', 397010),\n", + " ('對', 393805),\n", + " ('正', 393688),\n", + " ('系', 392783),\n", + " ('治', 392555),\n", + " ('金', 392494),\n", + " ('英', 390793),\n", + " ('格', 390268),\n", + " ('米', 389906),\n", + " ('尼', 388701),\n", + " ('這', 388498),\n", + " ('府', 386464),\n", + " ('開', 385362),\n", + " ('M', 384393),\n", + " ('长', 382519),\n", + " ('与', 380872),\n", + " ('制', 378642),\n", + " ('四', 375847),\n", + " ('如', 372625),\n", + " ('或', 371731),\n", + " ('電', 371551),\n", + " ('列', 369194),\n", + " ('巴', 368453),\n", + " ('元', 367562),\n", + " ('星', 365070),\n", + " ('T', 364514),\n", + " ('港', 364113),\n", + " ('影', 362517),\n", + " ('起', 362407),\n", + " ('員', 360173),\n", + " ('現', 359882),\n", + " ('g', 359127),\n", + " ('亞', 358195),\n", + " ('对', 355573),\n", + " ('經', 354948),\n", + " ('首', 354483),\n", + " ('常', 354237),\n", + " ('東', 353661),\n", + " ('女', 353631),\n", + " ('提', 350015),\n", + " ('阿', 346882),\n", + " ('進', 344636),\n", 
+ " ('改', 344086),\n", + " ('當', 341120),\n", + " ('来', 340798),\n", + " ('演', 339729),\n", + " ('林', 336815),\n", + " ('間', 335712),\n", + " ('B', 334835),\n", + " ('量', 334536),\n", + " ('并', 334134),\n", + " ('心', 332839),\n", + " ('過', 332363),\n", + " ('然', 330960),\n", + " ('向', 328494),\n", + " ('就', 328304),\n", + " ('东', 328088),\n", + " ('P', 326436),\n", + " ('始', 324881),\n", + " ('开', 323850),\n", + " ('接', 323779),\n", + " ('戰', 323477),\n", + " ('賽', 322898),\n", + " ('D', 322726),\n", + " ('界', 318638),\n", + " ('p', 318016),\n", + " ('體', 317834),\n", + " ('y', 314733),\n", + " ('曾', 313637),\n", + " ('动', 310998),\n", + " ('意', 310932),\n", + " ('將', 310562),\n", + " ('管', 307647),\n", + " ('機', 307041),\n", + " ('河', 307009),\n", + " ('車', 306644),\n", + " ('卡', 304322),\n", + " ('內', 304174),\n", + " ('镇', 303953),\n", + " ('手', 303526),\n", + " ('受', 302250),\n", + " ('员', 302057),\n", + " ('委', 300752),\n", + " ('程', 300520),\n", + " ('形', 299760),\n", + " ('交', 297947),\n", + " ('指', 296897),\n", + " ('色', 296783),\n", + " ('保', 296578),\n", + " ('場', 296368),\n", + " ('包', 295042),\n", + " ('I', 294062),\n", + " ('流', 292640),\n", + " ('型', 290557),\n", + " ('亚', 290181),\n", + " ('机', 290172),\n", + " ('已', 289800),\n", + " ('f', 288668),\n", + " ('香', 286281),\n", + " ('音', 285585),\n", + " ('些', 285145),\n", + " ('直', 283736),\n", + " ('内', 283452),\n", + " ('馬', 280524),\n", + " ('总', 279126),\n", + " ('太', 275584),\n", + " ('经', 274329),\n", + " ('集', 274113),\n", + " ('史', 273344),\n", + " ('身', 273032),\n", + " ('稱', 271301),\n", + " ('夫', 270599),\n", + " ('線', 270406),\n", + " ('过', 270359),\n", + " ('隊', 270205),\n", + " ('进', 268462),\n", + " ('五', 267428),\n", + " ('字', 267046),\n", + " ('該', 264882),\n", + " ('校', 264239),\n", + " ('品', 264130),\n", + " ('業', 263128),\n", + " ('县', 262995),\n", + " ('空', 262695),\n", + " ('现', 262407),\n", + " ('朝', 262041),\n", + " ('E', 260395),\n", + " ('种', 259792),\n", + " ('车', 
258938),\n", + " ('更', 258819),\n", + " ('江', 257888),\n", + " ('R', 256996),\n", + " ('種', 256163),\n", + " ('展', 255724),\n", + " ('古', 255601),\n", + " ('社', 255211),\n", + " ('该', 254463),\n", + " ('电', 254124),\n", + " ('示', 252980),\n", + " ('间', 252767),\n", + " ('兩', 252402),\n", + " ('設', 252154),\n", + " ('马', 250001),\n", + " ('灣', 249947),\n", + " ('各', 247403),\n", + " ('持', 247021),\n", + " ('业', 246672),\n", + " ('洲', 245773),\n", + " ('著', 244952),\n", + " ('N', 244899),\n", + " ('官', 242881),\n", + " ('将', 241701),\n", + " ('回', 241313),\n", + " ('數', 241246),\n", + " ('反', 241211),\n", + " ('命', 240106),\n", + " ('体', 237646),\n", + " ('k', 237388),\n", + " ('战', 237369),\n", + " ('G', 234625),\n", + " ('京', 234164),\n", + " ('L', 233805),\n", + " ('活', 233002),\n", + " ('取', 232975),\n", + " ('羅', 231230),\n", + " ('光', 231063),\n", + " ('造', 230981),\n", + " ('達', 230526),\n", + " ('信', 227626),\n", + " ('選', 227508),\n", + " ('b', 226561),\n", + " ('石', 226459),\n", + " ('少', 225706),\n", + " ('只', 223992),\n", + " ('支', 223568),\n", + " ('村', 223319),\n", + " ('近', 223234),\n", + " ('族', 223081),\n", + " ('神', 222888),\n", + " ('清', 222725),\n", + " ('称', 222196),\n", + " ('曲', 221024),\n", + " ('普', 220896),\n", + " ('局', 220552),\n", + " ('当', 220275),\n", + " ('又', 219805),\n", + " ('属', 219701),\n", + " ('解', 219554),\n", + " ('約', 219466),\n", + " ('它', 219168),\n", + " ('设', 219067),\n", + " ('罗', 218750),\n", + " ('H', 218102),\n", + " ('情', 217894),\n", + " ('聯', 217887),\n", + " ('数', 217489),\n", + " ('蘭', 217217),\n", + " ('非', 216450),\n", + " ('队', 215761),\n", + " ('每', 215661),\n", + " ('研', 213291),\n", + " ('O', 212849),\n", + " ('運', 212729),\n", + " ('波', 212046),\n", + " ('號', 211509),\n", + " ('件', 210305),\n", + " ('角', 209998),\n", + " ('放', 209383),\n", + " ('縣', 208835),\n", + " ('联', 208570),\n", + " ('武', 207668),\n", + " ('片', 207506),\n", + " ('則', 206183),\n", + " ('關', 205509),\n", + " ('密', 205338),\n", + " 
('總', 204923),\n", + " ('統', 204720),\n", + " ('版', 204203),\n", + " ('伊', 204184),\n", + " ('商', 203050),\n", + " ('即', 202966),\n", + " ('她', 202386),\n", + " ('華', 201388),\n", + " ('樂', 200087),\n", + " ('們', 199599),\n", + " ('李', 199272),\n", + " ('先', 199023),\n", + " ('計', 198509),\n", + " ('括', 197604),\n", + " ('两', 197523),\n", + " ('收', 196319),\n", + " ('F', 196115),\n", + " ('屬', 196024),\n", + " ('無', 195249),\n", + " ('器', 194519),\n", + " ('运', 194433),\n", + " ('往', 193706),\n", + " ('知', 193689),\n", + " ('场', 193256),\n", + " ('去', 193095),\n", + " ('座', 193025),\n", + " ('組', 192983),\n", + " ('白', 192869),\n", + " ('产', 191056),\n", + " ('未', 190989),\n", + " ('果', 190398),\n", + " ('再', 189266),\n", + " ('根', 188412),\n", + " ('歌', 188362),\n", + " ('視', 188245),\n", + " ('初', 188093),\n", + " ('赛', 186268),\n", + " ('华', 186221),\n", + " ('足', 186158),\n", + " ('福', 185682),\n", + " ('六', 185381),\n", + " ('达', 185158),\n", + " ('组', 184960),\n", + " ('究', 184003),\n", + " ('v', 183837),\n", + " ('统', 183054),\n", + " ('且', 182916),\n", + " ('語', 182338),\n", + " ('今', 182157),\n", + " ('线', 182010),\n", + " ('级', 181119),\n", + " ('兵', 181058),\n", + " ('从', 180973),\n", + " ('引', 180509),\n", + " ('派', 180499),\n", + " ('從', 179255),\n", + " ('議', 179047),\n", + " ('居', 178918),\n", + " ('语', 178804),\n", + " ('亦', 178605),\n", + " ('完', 177535),\n", + " ('打', 177422),\n", + " ('帝', 176902),\n", + " ('域', 176620),\n", + " ('w', 176139),\n", + " ('们', 175906),\n", + " ('号', 175714),\n", + " ('參', 174989),\n", + " ('季', 174363),\n", + " ('宗', 174134),\n", + " ('塔', 174114),\n", + " ('关', 173075),\n", + " ('除', 171436),\n", + " ('播', 170322),\n", + " ('威', 169979),\n", + " ('书', 168981),\n", + " ('務', 168894),\n", + " ('花', 168407),\n", + " ('很', 168249),\n", + " ('團', 168210),\n", + " ('段', 168011),\n", + " ('令', 167846),\n", + " ('八', 167706),\n", + " ('服', 167562),\n", + " ('說', 167392),\n", + " ('實', 167336),\n", + " ('计', 167303),\n", + 
" ('島', 166734),\n", + " ('育', 166535),\n", + " ('義', 166489),\n", + " ('門', 165434),\n", + " ('速', 164945),\n", + " ('兰', 164821),\n", + " ('广', 164210),\n", + " ('言', 164091),\n", + " ('產', 163195),\n", + " ('結', 163155),\n", + " ('约', 162440),\n", + " ('土', 161667),\n", + " ('功', 161447),\n", + " ('另', 160843),\n", + " ('死', 160580),\n", + " ('置', 160481),\n", + " ('父', 159923),\n", + " ('修', 159640),\n", + " ('希', 159417),\n", + " ('好', 158431),\n", + " ('存', 158262),\n", + " ('周', 158136),\n", + " ('客', 158084),\n", + " ('類', 157931),\n", + " ('傳', 157902),\n", + " ('推', 157366),\n", + " ('超', 156866),\n", + " ('雷', 156703),\n", + " ('率', 155829),\n", + " ('宣', 155765),\n", + " ('V', 155153),\n", + " ('處', 154921),\n", + " ('洛', 154404),\n", + " ('查', 153766),\n", + " ('吉', 153490),\n", + " ('我', 153341),\n", + " ('供', 153324),\n", + " ('九', 152986),\n", + " ('投', 152576),\n", + " ('具', 152376),\n", + " ('哈', 152301),\n", + " ('班', 151926),\n", + " ('街', 151717),\n", + " ('源', 151678),\n", + " ('認', 151616),\n", + " ('单', 151615),\n", + " ('实', 151390),\n", + " ('單', 151304),\n", + " ('博', 150101),\n", + " ('青', 150046),\n", + " ('党', 149938),\n", + " ('獲', 149856),\n", + " ('K', 149744),\n", + " ('瓦', 149325),\n", + " ('母', 148890),\n", + " ('百', 148184),\n", + " ('湖', 147901),\n", + " ('务', 147810),\n", + " ('書', 147788),\n", + " ('求', 147572),\n", + " ('田', 147532),\n", + " ('維', 146892),\n", + " ('印', 146693),\n", + " ('男', 146232),\n", + " ('维', 145957),\n", + " ('廣', 145742),\n", + " ('張', 145614),\n", + " ('伯', 145295),\n", + " ('火', 144739),\n", + " ('攻', 144671),\n", + " ('己', 144525),\n", + " ('故', 144506),\n", + " ('还', 144361),\n", + " ('落', 144305),\n", + " ('例', 144222),\n", + " ('舉', 144188),\n", + " ('案', 143967),\n", + " ('z', 143613),\n", + " ('那', 143562),\n", + " ('勒', 143562),\n", + " ('別', 143269),\n", + " ('皇', 142883),\n", + " ('助', 142780),\n", + " ('沙', 142772),\n", + " ('份', 142710),\n", + " ('航', 142510),\n", + " ('义', 142469),\n", 
+ " ('W', 142165),\n", + " ('帶', 141966),\n", + " ('参', 141832),\n", + " ('议', 141443),\n", + " ('变', 141044),\n", + " ('图', 140532),\n", + " ('臺', 140228),\n", + " ('早', 140153),\n", + " ('應', 139981),\n", + " ('技', 139788),\n", + " ('資', 139704),\n", + " ('致', 139268),\n", + " ('導', 139221),\n", + " ('龍', 139102),\n", + " ('整', 138777),\n", + " ('容', 138509),\n", + " ('增', 137959),\n", + " ('副', 137825),\n", + " ('模', 137700),\n", + " ('點', 137387),\n", + " ('萬', 137295),\n", + " ('半', 137221),\n", + " ('登', 137071),\n", + " ('劇', 136654),\n", + " ('專', 136511),\n", + " ('均', 136496),\n", + " ('群', 135618),\n", + " ('七', 135403),\n", + " ('积', 134739),\n", + " ('川', 134691),\n", + " ('说', 134684),\n", + " ('擊', 134586),\n", + " ('堂', 134532),\n", + " ('铁', 134373),\n", + " ('師', 134069),\n", + " ('选', 134038),\n", + " ('越', 133820),\n", + " ('万', 133369),\n", + " ('級', 133094),\n", + " ('游', 132824),\n", + " ('失', 132291),\n", + " ('处', 132117),\n", + " ('门', 131694),\n", + " ('變', 131688),\n", + " ('点', 131461),\n", + " ('告', 130988),\n", + " ('易', 130770),\n", + " ('食', 130723),\n", + " ('領', 130618),\n", + " ('乡', 130599),\n", + " ('氏', 130340),\n", + " ('奥', 129907),\n", + " ('辖', 129634),\n", + " ('深', 129567),\n", + " ('黨', 129285),\n", + " ('風', 129159),\n", + " ('境', 129148),\n", + " ('领', 129075),\n", + " ('结', 128312),\n", + " ('紀', 128223),\n", + " ('室', 127991),\n", + " ('传', 127796),\n", + " ('黑', 127771),\n", + " ('塞', 127038),\n", + " ('射', 127006),\n", + " ('排', 126751),\n", + " ('升', 126543),\n", + " ('应', 126535),\n", + " ('需', 126341),\n", + " ('导', 126328),\n", + " ('限', 125959),\n", + " ('甲', 125958),\n", + " ('頭', 125448),\n", + " ('製', 125108),\n", + " ('团', 125104),\n", + " ('節', 125019),\n", + " ('乐', 124927),\n", + " ('連', 124841),\n", + " ('轉', 124586),\n", + " ('象', 124562),\n", + " ('像', 124360),\n", + " ('老', 124289),\n", + " ('无', 123544),\n", + " ('央', 123394),\n", + " ('真', 123086),\n", + " ('沒', 123072),\n", + " ('畫', 
122604),\n", + " ('何', 122439),\n", + " ('U', 122366),\n", + " ('苏', 122365),\n", + " ('才', 122224),\n", + " ('类', 122219),\n", + " ('托', 121767),\n", + " ('仍', 121736),\n", + " ('获', 121698),\n", + " ('木', 121590),\n", + " ('便', 120778),\n", + " ('据', 120454),\n", + " ('洋', 120050),\n", + " ('鐵', 120011),\n", + " ('步', 119954),\n", + " ('牙', 119825),\n", + " ('森', 119639),\n", + " ('條', 119443),\n", + " ('報', 119391),\n", + " ('盟', 119374),\n", + " ('際', 119188),\n", + " ('奇', 119154),\n", + " ('氣', 118708),\n", + " ('權', 118602),\n", + " ('考', 118330),\n", + " ('席', 118307),\n", + " ('想', 118279),\n", + " ('魚', 118272),\n", + " ('積', 118008),\n", + " ('施', 117372),\n", + " ('感', 116760),\n", + " ('配', 116591),\n", + " ('哥', 116487),\n", + " ('J', 116467),\n", + " ('標', 116329),\n", + " ('決', 116227),\n", + " ('低', 116133),\n", + " ('奧', 115885),\n", + " ('論', 115797),\n", + " ('防', 115721),\n", + " ('纳', 115188),\n", + " ('师', 114762),\n", + " ('住', 114045),\n", + " ('效', 113893),\n", + " ('留', 113782),\n", + " ('止', 113746),\n", + " ('术', 113445),\n", + " ('律', 113314),\n", + " ('认', 113285),\n", + " ('姆', 113277),\n", + " ('把', 113095),\n", + " ('历', 112260),\n", + " ('底', 112207),\n", + " ('记', 111883),\n", + " ('還', 111668),\n", + " ('術', 111615),\n", + " ('埃', 111571),\n", + " ('泰', 111285),\n", + " ('親', 111130),\n", + " ('料', 111078),\n", + " ('曼', 110629),\n", + " ('病', 110519),\n", + " ('離', 110513),\n", + " ('冠', 110329),\n", + " ('佛', 110326),\n", + " ('视', 110308),\n", + " ('风', 110245),\n", + " ('纪', 110175),\n", + " ('阳', 109809),\n", + " ('千', 109448),\n", + " ('恩', 109346),\n", + " ('警', 109119),\n", + " ('據', 108854),\n", + " ('志', 108645),\n", + " ('降', 108596),\n", + " ('景', 108453),\n", + " ('看', 108416),\n", + " ('蒙', 108306),\n", + " ('报', 108276),\n", + " ('息', 107990),\n", + " ('俄', 107958),\n", + " ('強', 107936),\n", + " ('迪', 107648),\n", + " ('祖', 107637),\n", + " ('創', 107082),\n", + " ('察', 107009),\n", + " ('唱', 106912),\n", + " 
('移', 106740),\n", + " ('納', 106718),\n", + " ('素', 106534),\n", + " ('圖', 106375),\n", + " ('着', 106345),\n", + " ('批', 105772),\n", + " ('张', 105675),\n", + " ('康', 105597),\n", + " ('革', 105447),\n", + " ('負', 105360),\n", + " ('職', 105179),\n", + " ('则', 104399),\n", + " ('愛', 104300),\n", + " ('消', 104273),\n", + " ('算', 104202),\n", + " ('承', 104198),\n", + " ('觀', 104137),\n", + " ('索', 104088),\n", + " ('較', 104086),\n", + " ('带', 104018),\n", + " ('授', 103682),\n", + " ('典', 103668),\n", + " ('_', 103620),\n", + " ('控', 103423),\n", + " ('遊', 103323),\n", + " ('念', 103024),\n", + " ('續', 102958),\n", + " ('藏', 102923),\n", + " ('汉', 101961),\n", + " ('专', 101882),\n", + " ('獎', 101672),\n", + " ('思', 101588),\n", + " ('許', 101259),\n", + " ('标', 100979),\n", + " ('記', 100883),\n", + " ('票', 100548),\n", + " ('責', 100538),\n", + " ('守', 100446),\n", + " ('做', 100439),\n", + " ('友', 100220),\n", + " ('岛', 99854),\n", + " ('论', 99377),\n", + " ('没', 98977),\n", + " ('見', 98953),\n", + " ('精', 98863),\n", + " ('际', 98797),\n", + " ('别', 98091),\n", + " ('瑞', 97896),\n", + " ('裝', 97284),\n", + " ('督', 96375),\n", + " ('附', 96230),\n", + " ('望', 95809),\n", + " ('題', 95646),\n", + " ('陸', 95179),\n", + " ('依', 95157),\n", + " ('錄', 94822),\n", + " ('照', 94775),\n", + " ('房', 94743),\n", + " ('夏', 94681),\n", + " ('質', 94661),\n", + " ('述', 94433),\n", + " ('办', 94390),\n", + " ('转', 94371),\n", + " ('園', 94359),\n", + " ('毛', 94208),\n", + " ('草', 93824),\n", + " ('拔', 93588),\n", + " ('章', 93577),\n", + " ('圣', 93412),\n", + " ('抗', 93149),\n", + " ('編', 93136),\n", + " ('歷', 93082),\n", + " ('击', 92879),\n", + " ('资', 92821),\n", + " ('調', 92575),\n", + " ('权', 92382),\n", + " ('網', 92357),\n", + " ('条', 91960),\n", + " ('职', 91787),\n", + " ('节', 91659),\n", + " ('气', 91538),\n", + " ('封', 91341),\n", + " ('决', 91294),\n", + " ('富', 91278),\n", + " ('退', 91039),\n", + " ('般', 90979),\n", + " ('停', 90904),\n", + " ('澳', 90811),\n", + " ('左', 90752),\n", + " 
('蘇', 90677),\n", + " ('永', 90655),\n", + " ('極', 90650),\n", + " ('复', 90497),\n", + " ('辦', 90486),\n", + " ('丹', 90361),\n", + " ('唐', 90129),\n", + " ('x', 89981),\n", + " ('陆', 89933),\n", + " ('試', 89718),\n", + " ('堡', 89589),\n", + " ('戲', 89325),\n", + " ('爭', 89189),\n", + " ('丁', 89186),\n", + " ('陳', 89112),\n", + " ('牌', 89029),\n", + " ('构', 88981),\n", + " ('值', 88924),\n", + " ('拿', 88753),\n", + " ('快', 88628),\n", + " ('終', 88618),\n", + " ('隨', 88511),\n", + " ('船', 88483),\n", + " ('歐', 88391),\n", + " ('營', 88363),\n", + " ('候', 88347),\n", + " ('聖', 88231),\n", + " ('坦', 88094),\n", + " ('破', 87862),\n", + " ('隆', 87850),\n", + " ('梅', 87461),\n", + " ('寺', 87390),\n", + " ('給', 87337),\n", + " ('邦', 87005),\n", + " ('植', 86765),\n", + " ('郡', 86725),\n", + " ('鎮', 86584),\n", + " ('萨', 86424),\n", + " ('擔', 86366),\n", + " ('莱', 85980),\n", + " ('莫', 85573),\n", + " ('協', 85570),\n", + " ('含', 85530),\n", + " ('调', 85358),\n", + " ('問', 85127),\n", + " ('黃', 84929),\n", + " ('似', 84723),\n", + " ('突', 84699),\n", + " ('皮', 84610),\n", + " ('创', 84203),\n", + " ('必', 83904),\n", + " ('興', 83729),\n", + " ('尚', 83459),\n", + " ('准', 83283),\n", + " ('旅', 83109),\n", + " ('强', 83039),\n", + " ('眾', 83019),\n", + " ('蒂', 82986),\n", + " ('走', 82714),\n", + " ('項', 82683),\n", + " ('举', 82639),\n", + " ('离', 82524),\n", + " ('注', 82492),\n", + " ('剧', 82482),\n", + " ('久', 82239),\n", + " ('争', 82222),\n", + " ('嘉', 81692),\n", + " ('岸', 81671),\n", + " ('济', 81487),\n", + " ('樣', 81427),\n", + " ('藝', 81322),\n", + " ('划', 81297),\n", + " ('切', 81164),\n", + " ('右', 80824),\n", + " ('飛', 80759),\n", + " ('兒', 80726),\n", + " ('头', 80687),\n", + " ('店', 80650),\n", + " ('延', 80407),\n", + " ('朗', 80300),\n", + " ('轄', 80261),\n", + " ('网', 80054),\n", + " ('漢', 79838),\n", + " ('鄉', 79658),\n", + " ('随', 79617),\n", + " ('规', 79594),\n", + " ('野', 79035),\n", + " ('架', 79007),\n", + " ('弟', 78994),\n", + " ('熱', 78903),\n", + " ('邊', 78863),\n", + 
" ('龙', 78739),\n", + " ('倫', 78712),\n", + " ('春', 78513),\n", + " ('装', 78421),\n", + " ('连', 78387),\n", + " ('核', 78315),\n", + " ('售', 78277),\n", + " ('给', 78244),\n", + " ('衛', 78045),\n", + " ('护', 77913),\n", + " ('館', 77769),\n", + " ('亡', 77693),\n", + " ('規', 77647),\n", + " ('协', 77539),\n", + " ('质', 77406),\n", + " ('油', 77375),\n", + " ('编', 77293),\n", + " ('環', 77090),\n", + " ('艾', 76833),\n", + " ('層', 76828),\n", + " ('備', 76649),\n", + " ('讓', 76635),\n", + " ('構', 76365),\n", + " ('佳', 76056),\n", + " ('弗', 76030),\n", + " ('占', 75999),\n", + " ('判', 75750),\n", + " ('黄', 75658),\n", + " ('雄', 75547),\n", + " ('X', 75528),\n", + " ('繼', 75474),\n", + " ('筑', 75385),\n", + " ('端', 75135),\n", + " ('鲁', 75054),\n", + " ('短', 74940),\n", + " ('股', 74936),\n", + " ('题', 74471),\n", + " ('圍', 74137),\n", + " ('耶', 74110),\n", + " ('卫', 73704),\n", + " ('许', 73696),\n", + " ('遭', 73526),\n", + " ('戏', 73441),\n", + " ('輯', 73372),\n", + " ('问', 73239),\n", + " ('護', 73075),\n", + " ('宁', 72970),\n", + " ('策', 72800),\n", + " ('薩', 72779),\n", + " ('雙', 72740),\n", + " ('样', 72679),\n", + " ('逐', 72434),\n", + " ('松', 72320),\n", + " ('赫', 72312),\n", + " ('担', 72226),\n", + " ('馆', 72177),\n", + " ('評', 72162),\n", + " ('观', 71822),\n", + " ('帕', 71758),\n", + " ('距', 71703),\n", + " ('役', 71667),\n", + " ('聲', 71563),\n", + " ('摩', 71501),\n", + " ...]" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'all_wiki_content' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m 
\u001b[1;32mdel\u001b[0m \u001b[0mall_wiki_content\u001b[0m \u001b[1;31m# 内存不够\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mgram_length\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtwo_gram_counts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCounter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mall_character\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mgram_length\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mall_character\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mgram_length\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'all_wiki_content' is not defined" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'新华': 135490,\n", + " '华社': 129104,\n", + " '社照': 45003,\n", + " '照片': 46712,\n", + " '片东': 235,\n", + " '东莞': 1543,\n", + " '莞广': 195,\n", + " '广东': 5182,\n", + " '东2': 922,\n", + " '20': 123427,\n", + " '01': 102583,\n", + " '17': 81801,\n", + " '7年': 59051,\n", + " '年4': 21904,\n", + " '4月': 51236,\n", + " '月7': 3777,\n", + " '7日': 12919,\n", + " '日n': 48360,\n", + " 'n体': 18616,\n", + " '体育': 27169,\n", + " '育9': 438,\n", + " '9篮': 75,\n", + " '篮球': 5154,\n", + " '球C': 416,\n", + " 'CB': 1003,\n", + " 'BA': 5552,\n", + " 'A总': 939,\n", + " '总决': 1968,\n", + " '决赛': 16108,\n", + " '赛第': 6484,\n", + " '第四': 2788,\n", + " '四场': 474,\n", + " '场广': 176,\n", + " '东对': 83,\n", + " '对阵': 2153,\n", + " '阵新': 88,\n", + " '新疆': 4703,\n", + " '疆n': 134,\n", + " 'n4': 23273,\n", + " '日广': 727,\n", + 
" '东东': 634,\n", + " '莞银': 555,\n", + " '银行': 9538,\n", + " '行队': 935,\n", + " '队球': 10498,\n", + " '球员': 13102,\n", + " '员易': 71,\n", + " '易建': 110,\n", + " '建联': 161,\n", + " '联在': 98,\n", + " '在比': 9666,\n", + " '比赛': 29313,\n", + " '赛中': 26877,\n", + " '中扣': 112,\n", + " '扣篮': 118,\n", + " '篮n': 720,\n", + " 'n当': 25000,\n", + " '当日': 24291,\n", + " '日在': 39495,\n", + " '在2': 9127,\n", + " '16': 25070,\n", + " '62': 7886,\n", + " '7赛': 7121,\n", + " '赛季': 9782,\n", + " '季中': 1411,\n", + " '中国': 77776,\n", + " '国男': 1192,\n", + " '男子': 5085,\n", + " '子篮': 421,\n", + " '球职': 422,\n", + " '职业': 3435,\n", + " '业联': 791,\n", + " '联赛': 18165,\n", + " '赛C': 429,\n", + " '场比': 2857,\n", + " '中广': 562,\n", + " '队主': 5662,\n", + " '主场': 5943,\n", + " '场迎': 386,\n", + " '迎战': 618,\n", + " '战新': 164,\n", + " '疆喀': 764,\n", + " '喀什': 1017,\n", + " '什古': 620,\n", + " '古城': 1428,\n", + " '城队': 1579,\n", + " '队n': 7145,\n", + " 'n新': 78433,\n", + " '社记': 27920,\n", + " '记者': 56946,\n", + " '者孟': 238,\n", + " '孟永': 121,\n", + " '永民': 122,\n", + " '民摄': 220,\n", + " '摄n': 11579,\n", + " '社北': 5509,\n", + " '北京': 27639,\n", + " '京4': 1073,\n", + " '4月': 7783,\n", + " '月1': 6440,\n", + " '14': 4021,\n", + " '4日': 3202,\n", + " '日新': 3927,\n", + " '新媒': 2760,\n", + " '媒体': 11080,\n", + " '体专': 2203,\n", + " '专电': 2123,\n", + " '电记': 11810,\n", + " '者杨': 1405,\n", + " '杨烨': 18,\n", + " '烨作': 4,\n", + " '作为': 9679,\n", + " '为国': 1607,\n", + " '国民': 2752,\n", + " '民经': 220,\n", + " '经济': 25304,\n", + " '济的': 973,\n", + " '的重': 7007,\n", + " '重要': 14453,\n", + " '要支': 278,\n", + " '支柱': 351,\n", + " '柱央': 2,\n", + " '央企': 698,\n", + " '企一': 23,\n", + " '一季': 2660,\n", + " '季度': 3356,\n", + " '度交': 57,\n", + " '交上': 55,\n", + " '上了': 1557,\n", + " '了一': 8259,\n", + " '一份': 1723,\n", + " '份漂': 3,\n", + " '漂亮': 340,\n", + " '亮的': 370,\n", + " '的成': 3755,\n", + " '成绩': 3672,\n", + " '绩单': 148,\n", + " '单国': 11,\n", + " '国资': 662,\n", + " '资委': 322,\n", + " '委总': 32,\n", + " '总会': 325,\n", 
+ " '会计': 404,\n", + " '计师': 2453,\n", + " '师沈': 17,\n", + " '沈莹': 26,\n", + " '莹在': 45,\n", + " '在1': 849,\n", + " '13': 4147,\n", + " '3日': 3456,\n", + " '日国': 1034,\n", + " '国新': 1874,\n", + " '新办': 233,\n", + " '办新': 88,\n", + " '新闻': 11645,\n", + " '闻发': 1950,\n", + " '发布': 10817,\n", + " '布会': 3455,\n", + " '会上': 5442,\n", + " '上表': 967,\n", + " '表示': 20215,\n", + " '示一': 618,\n", + " '度中': 340,\n", + " '中央': 7244,\n", + " '企业': 27381,\n", + " '业累': 52,\n", + " '累计': 1961,\n", + " '计实': 119,\n", + " '实现': 10625,\n", + " '现利': 112,\n", + " '利润': 1653,\n", + " '润总': 99,\n", + " '总额': 939,\n", + " '额3': 23,\n", + " '31': 2316,\n", + " '12': 6255,\n", + " '20': 24199,\n", + " '0亿': 1184,\n", + " '亿元': 8923,\n", + " '元同': 1121,\n", + " '同比': 3888,\n", + " '比增': 2439,\n", + " '增长': 10003,\n", + " '长2': 316,\n", + " '23': 2952,\n", + " '32': 1293,\n", + " '3月': 2125,\n", + " '月当': 95,\n", + " '当月': 207,\n", + " '月利': 25,\n", + " '额达': 428,\n", + " '达到': 4860,\n", + " '到历': 52,\n", + " '历史': 7278,\n", + " '史同': 48,\n", + " '同期': 1240,\n", + " '期最': 119,\n", + " '最好': 1240,\n", + " '好水': 68,\n", + " '水平': 5432,\n", + " '平1': 34,\n", + " '10': 11467,\n", + " '02': 2368,\n", + " '2家': 122,\n", + " '家中': 1399,\n", + " '业中': 830,\n", + " '中有': 1733,\n", + " '有9': 119,\n", + " '99': 1404,\n", + " '9家': 80,\n", + " '家盈': 11,\n", + " '盈利': 805,\n", + " '利8': 5,\n", + " '81': 602,\n", + " '1家': 102,\n", + " '家企': 1095,\n", + " '业增': 474,\n", + " '增利': 30,\n", + " '利4': 12,\n", + " '43': 826,\n", + " '3家': 112,\n", + " '业效': 87,\n", + " '效益': 820,\n", + " '益增': 153,\n", + " '增幅': 560,\n", + " '幅超': 190,\n", + " '超过': 6453,\n", + " '过1': 607,\n", + " '0一': 21,\n", + " '一些': 8372,\n", + " '些长': 22,\n", + " '长期': 3531,\n", + " '期亏': 23,\n", + " '亏损': 438,\n", + " '损的': 118,\n", + " '的企': 1465,\n", + " '业实': 372,\n", + " '现扭': 10,\n", + " '扭亏': 61,\n", + " '亏为': 40,\n", + " '为盈': 44,\n", + " '盈n': 24,\n", + " 'n值': 276,\n", + " '值得': 1745,\n", + " '得注': 396,\n", + " '注意': 
2000,\n", + " '意的': 956,\n", + " '的是': 6371,\n", + " '是她': 369,\n", + " '她同': 61,\n", + " '同时': 10754,\n", + " '时透': 47,\n", + " '透露': 1413,\n", + " '露了': 194,\n", + " '了下': 257,\n", + " '下一': 3928,\n", + " '一步': 8608,\n", + " '步央': 4,\n", + " '企改': 193,\n", + " '改革': 10095,\n", + " '革重': 72,\n", + " '重点': 5668,\n", + " '点国': 79,\n", + " '委将': 90,\n", + " '将加': 535,\n", + " '加快': 3248,\n", + " '快推': 455,\n", + " '推动': 8002,\n", + " '动中': 1943,\n", + " '业战': 43,\n", + " '战略': 5892,\n", + " '略性': 391,\n", + " '性重': 44,\n", + " '重组': 796,\n", + " '组加': 16,\n", + " '快钢': 6,\n", + " '钢铁': 893,\n", + " '铁煤': 76,\n", + " '煤炭': 822,\n", + " '炭煤': 10,\n", + " '煤化': 59,\n", + " '化工': 799,\n", + " '工等': 204,\n", + " '等业': 146,\n", + " '业务': 5239,\n", + " '务的': 1911,\n", + " '的专': 1370,\n", + " '专业': 5770,\n", + " '业化': 1152,\n", + " '化整': 61,\n", + " '整合': 1203,\n", + " '合n': 311,\n", + " 'n一': 5965,\n", + " '现净': 97,\n", + " '净利': 648,\n", + " '润2': 7,\n", + " '22': 2948,\n", + " '26': 2599,\n", + " '64': 769,\n", + " '42': 1006,\n", + " '2亿': 333,\n", + " '65': 973,\n", + " '5其': 18,\n", + " '其中': 9548,\n", + " '中归': 5,\n", + " '归属': 179,\n", + " '属于': 1777,\n", + " '于母': 28,\n", + " '母公': 126,\n", + " '公司': 26427,\n", + " '司所': 76,\n", + " '所有': 4421,\n", + " '有者': 127,\n", + " '者的': 2395,\n", + " '的净': 163,\n", + " '润1': 10,\n", + " '27': 2077,\n", + " '7亿': 299,\n", + " '24': 2719,\n", + " '46': 745,\n", + " '6沈': 1,\n", + " '莹告': 6,\n", + " '告诉': 3787,\n", + " '诉记': 2063,\n", + " '者其': 136,\n", + " '中石': 438,\n", + " '石油': 1560,\n", + " '油石': 64,\n", + " '石化': 551,\n", + " '化钢': 15,\n", + " '铁有': 71,\n", + " '有色': 220,\n", + " '色煤': 9,\n", + " '炭等': 29,\n", + " '等传': 198,\n", + " '传统': 7639,\n", + " '统产': 223,\n", + " '产业': 14123,\n", + " '益逐': 10,\n", + " '逐步': 1891,\n", + " '步回': 101,\n", + " '回升': 609,\n", + " '升先': 12,\n", + " '先进': 2113,\n", + " '进制': 199,\n", + " '制造': 6122,\n", + " '造业': 1820,\n", + " '业医': 109,\n", + " '医药': 1460,\n", + " '药现': 15,\n", + " '现代': 
3155,\n", + " '代服': 177,\n", + " '服务': 20067,\n", + " '务业': 917,\n", + " '业等': 654,\n", + " '等行': 461,\n", + " '行业': 7341,\n", + " '益贡': 7,\n", + " '贡献': 2324,\n", + " '献稳': 6,\n", + " '稳步': 608,\n", + " '步提': 641,\n", + " '提升': 6100,\n", + " '升n': 493,\n", + " 'n在': 7524,\n", + " '在她': 300,\n", + " '她看': 81,\n", + " '看来': 1415,\n", + " '来央': 42,\n", + " '企利': 16,\n", + " '润大': 15,\n", + " '大幅': 2134,\n", + " '幅回': 168,\n", + " '升一': 72,\n", + " '一是': 1406,\n", + " '是抓': 72,\n", + " '抓住': 762,\n", + " '住了': 478,\n", + " '了大': 1483,\n", + " '大宗': 542,\n", + " '宗商': 375,\n", + " '商品': 3353,\n", + " '品市': 225,\n", + " '市场': 22666,\n", + " '场回': 59,\n", + " '回调': 167,\n", + " '调的': 258,\n", + " '的有': 1715,\n", + " '有利': 1779,\n", + " '利机': 60,\n", + " '机遇': 1964,\n", + " '遇加': 36,\n", + " '加大': 2469,\n", + " '大生': 181,\n", + " '生产': 8724,\n", + " '产组': 32,\n", + " '组织': 12481,\n", + " '织力': 30,\n", + " '力度': 2151,\n", + " '业抢': 13,\n", + " '抢抓': 162,\n", + " '抓市': 6,\n", + " '场机': 157,\n", + " '遇在': 41,\n", + " '在价': 85,\n", + " '价格': 6823,\n", + " '格回': 47,\n", + " '升的': 375,\n", + " '的时': 5529,\n", + " '时候': 3767,\n", + " '候加': 5,\n", + " '大排': 48,\n", + " '排产': 6,\n", + " '产力': 158,\n", + " '度提': 249,\n", + " '提高': 6118,\n", + " '高生': 81,\n", + " '产负': 219,\n", + " '负荷': 146,\n", + " '荷抓': 2,\n", + " '住市': 18,\n", + " '场的': 3422,\n", + " '的窗': 157,\n", + " '窗口': 910,\n", + " '口期': 62,\n", + " '期提': 97,\n", + " '高产': 171,\n", + " '产销': 302,\n", + " '销量': 659,\n", + " '量二': 14,\n", + " '二是': 1134,\n", + " '是得': 85,\n", + " '得益': 438,\n", + " '益于': 536,\n", + " '于加': 458,\n", + " '大供': 22,\n", + " '供给': 1792,\n", + " '给侧': 1008,\n", + " '侧结': 710,\n", + " '结构': 3804,\n", + " '构性': 1021,\n", + " '性改': 809,\n", + " '革力': 32,\n", + " '度特': 36,\n", + " '特别': 4664,\n", + " '别是': 2253,\n", + " '是钢': 22,\n", + " '炭去': 37,\n", + " '去产': 345,\n", + " '产能': 1973,\n", + " '能工': 228,\n", + " '工作': 25311,\n", + " '作在': 506,\n", + " '在2': 1640,\n", + " '01': 15505,\n", + " '16': 
7193,\n", + " '6年': 4210,\n", + " '年取': 53,\n", + " '取得': 4713,\n", + " '得了': 1941,\n", + " '了很': 1071,\n", + " '很好': 905,\n", + " '好的': 4474,\n", + " '的进': 1309,\n", + " '进展': 1459,\n", + " '展化': 11,\n", + " '化解': 545,\n", + " '解钢': 15,\n", + " '铁过': 16,\n", + " '过剩': 438,\n", + " '剩产': 143,\n", + " '能1': 30,\n", + " '19': 5692,\n", + " '9万': 239,\n", + " '万吨': 1402,\n", + " '吨化': 7,\n", + " '解煤': 22,\n", + " '炭过': 21,\n", + " '能3': 12,\n", + " '34': 932,\n", + " '49': 673,\n", + " '97': 882,\n", + " '7万': 380,\n", + " '吨均': 10,\n", + " '均超': 140,\n", + " '超额': 94,\n", + " '额完': 53,\n", + " '完成': 6661,\n", + " '成年': 907,\n", + " '年度': 2106,\n", + " '度任': 17,\n", + " '任务': 4218,\n", + " '务提': 190,\n", + " '高了': 646,\n", + " '了先': 56,\n", + " '进产': 173,\n", + " '能的': 910,\n", + " '的利': 884,\n", + " '利用': 5295,\n", + " '用效': 126,\n", + " '效率': 1430,\n", + " '率稳': 39,\n", + " '稳定': 4016,\n", + " '定了': 1408,\n", + " '了市': 335,\n", + " '场局': 12,\n", + " '局面': 893,\n", + " '面促': 40,\n", + " '促进': 5413,\n", + " '进了': 929,\n", + " '了行': 123,\n", + " '业健': 168,\n", + " '健康': 5468,\n", + " '康发': 585,\n", + " '发展': 36410,\n", + " '展n': 3577,\n", + " 'n改': 227,\n", + " '革积': 20,\n", + " '积极': 6711,\n", + " '极成': 82,\n", + " '成效': 986,\n", + " '效已': 11,\n", + " '已经': 10854,\n", + " '经显': 24,\n", + " '显现': 471,\n", + " '现特': 47,\n", + " '是积': 79,\n", + " '极开': 232,\n", + " '开展': 8895,\n", + " '展重': 101,\n", + " '组和': 219,\n", + " '和调': 171,\n", + " '调整': 3644,\n", + " '整工': 17,\n", + " '作提': 344,\n", + " '了存': 19,\n", + " '存量': 392,\n", + " '量资': 218,\n", + " '资源': 7018,\n", + " '源的': 629,\n", + " '的配': 352,\n", + " '配置': 1034,\n", + " '置效': 25,\n", + " '率也': 210,\n", + " '也是': 6373,\n", + " '是央': 35,\n", + " '润回': 10,\n", + " '升非': 14,\n", + " '非常': 4885,\n", + " '常重': 409,\n", + " '要的': 3469,\n", + " '的原': 1841,\n", + " '原因': 3356,\n", + " '因沈': 2,\n", + " '莹说': 77,\n", + " '说她': 298,\n", + " '她进': 23,\n", + " '进一': 7264,\n", + " '步表': 27,\n", + " '示去': 126,\n", + " '去年': 
6315,\n", + " '年宝': 14,\n", + " '宝钢': 32,\n", + " '钢和': 13,\n", + " '和武': 204,\n", + " '武钢': 61,\n", + " '钢进': 2,\n", + " '进行': 23528,\n", + " '行重': 165,\n", + " '组成': 2705,\n", + " '成立': 4345,\n", + " '立了': 1774,\n", + " '了宝': 80,\n", + " '宝武': 17,\n", + " '武集': 15,\n", + " '集团': 7298,\n", + " '团新': 50,\n", + " '新集': 32,\n", + " '团成': 280,\n", + " '立以': 397,\n", + " '以后': 1155,\n", + " '后可': 536,\n", + " '可以': 11898,\n", + " '以减': 133,\n", + " '减少': 2940,\n", + " '少重': 28,\n", + " '重复': 431,\n", + " '复建': 67,\n", + " '建设': 18606,\n", + " '设同': 57,\n", + " '时也': 1494,\n", + " '也减': 18,\n", + " '少了': 427,\n", + " '了管': 46,\n", + " '管理': 12621,\n", + " '理成': 97,\n", + " '成本': 3543,\n", + " '本提': 97,\n", + " '了效': 24,\n", + " '率再': 14,\n", + " '再比': 34,\n", + " '比如': 1893,\n", + " '如煤': 7,\n", + " '炭行': 41,\n", + " '业去': 91,\n", + " '年调': 54,\n", + " '整力': 6,\n", + " '度也': 257,\n", + " '也很': 574,\n", + " '很大': 1591,\n", + " '大成': 350,\n", + " '了煤': 17,\n", + " '炭资': 22,\n", + " '源整': 89,\n", + " '合平': 31,\n", + " '平台': 9763,\n", + " '台已': 65,\n", + " '经取': 89,\n", + " '效n': 339,\n", + " 'n据': 5941,\n", + " '据了': 2169,\n", + " '了解': 7101,\n", + " '解2': 43,\n", + " '年尤': 22,\n", + " '尤其': 1835,\n", + " '其是': 1390,\n", + " '是下': 128,\n", + " '下半': 1073,\n", + " '半年': 1692,\n", + " '年以': 2810,\n", + " '以来': 7949,\n", + " '企间': 4,\n", + " '间重': 50,\n", + " '组整': 33,\n", + " '合案': 12,\n", + " '案例': 780,\n", + " '例纷': 2,\n", + " '纷至': 62,\n", + " '至沓': 62,\n", + " '沓来': 62,\n", + " '来继': 33,\n", + " '继7': 1,\n", + " '7月': 902,\n", + " '11': 5998,\n", + " '1日': 4770,\n", + " '日中': 3695,\n", + " '国港': 89,\n", + " '港中': 340,\n", + " '中旅': 53,\n", + " '旅集': 28,\n", + " '团公': 523,\n", + " '司与': 286,\n", + " '与中': 1989,\n", + " '国国': 4636,\n", + " '国旅': 374,\n", + " '团有': 459,\n", + " '有限': 5842,\n", + " '限公': 4634,\n", + " '司发': 257,\n", + " '布公': 321,\n", + " '公告': 2147,\n", + " '告称': 503,\n", + " '称实': 58,\n", + " '实施': 7568,\n", + " '施战': 20,\n", + " '略重': 42,\n", + " '组之': 
11,\n", + " '之后': 4283,\n", + " '后7': 13,\n", + " '委罕': 2,\n", + " '罕见': 406,\n", + " '见做': 6,\n", + " '做加': 31,\n", + " '加法': 114,\n", + " '法组': 37,\n", + " '组建': 1027,\n", + " '建中': 322,\n", + " '国航': 1289,\n", + " '航空': 4126,\n", + " '空发': 80,\n", + " '发动': 1184,\n", + " '动机': 756,\n", + " '机集': 59,\n", + " '团7': 4,\n", + " '15': 5946,\n", + " '5日': 3773,\n", + " '中粮': 76,\n", + " '粮集': 21,\n", + " '国中': 1179,\n", + " '中纺': 6,\n", + " '纺集': 2,\n", + " '司实': 145,\n", + " '组8': 8,\n", + " '8月': 501,\n", + " '月2': 6302,\n", + " '2日': 3333,\n", + " '国建': 769,\n", + " '建筑': 3808,\n", + " '筑材': 61,\n", + " '材料': 2236,\n", + " '料集': 11,\n", + " '中材': 4,\n", + " '材集': 9,\n", + " '施重': 59,\n", + " '组9': 3,\n", + " '9月': 613,\n", + " '日宝': 21,\n", + " '钢集': 71,\n", + " '与武': 70,\n", + " '武汉': 4198,\n", + " '汉钢': 12,\n", + " '铁集': 84,\n", + " '组1': 32,\n", + " '1月': 1464,\n", + " '国储': 34,\n", + " '储备': 683,\n", + " '备粮': 16,\n", + " '粮管': 9,\n", + " '理总': 149,\n", + " '总公': 227,\n", + " '备棉': 2,\n", + " '棉管': 2,\n", + " '组除': 6,\n", + " '除此': 198,\n", + " '此之': 395,\n", + " '之外': 1141,\n", + " '外1': 29,\n", + " '0日': 3246,\n", + " '国恒': 9,\n", + " '恒天': 25,\n", + " '天与': 118,\n", + " '国机': 286,\n", + " '机械': 1369,\n", + " '械工': 75,\n", + " '工业': 5552,\n", + " '业集': 760,\n", + " '司签': 164,\n", + " '签署': 2247,\n", + " '署重': 11,\n", + " '组协': 7,\n", + " '协议': 4183,\n", + " '议中': 622,\n", + " '天整': 13,\n", + " '整体': 2324,\n", + " '体产': 203,\n", + " '产权': 1615,\n", + " '权将': 27,\n", + " '将无': 113,\n", + " '无偿': 134,\n", + " '偿划': 8,\n", + " '划转': 51,\n", + " '转进': 14,\n", + " '进入': 8197,\n", + " '入国': 444,\n", + " '团截': 3,\n", + " '截至': 3481,\n", + " '至目': 789,\n", + " '目前': 15296,\n", + " '前由': 130,\n", + " '由国': 468,\n", + " '委直': 7,\n", + " '直接': 3826,\n", + " '接监': 8,\n", + " '监管': 5116,\n", + " '管的': 407,\n", + " '的中': 6759,\n", + " '业数': 214,\n", + " '数量': 2598,\n", + " '量已': 283,\n", + " '经降': 37,\n", + " '降至': 539,\n", + " '至1': 891,\n", + " '家n': 847,\n", + " 'n沈': 
111,\n", + " '莹坦': 2,\n", + " '坦言': 378,\n", + " '言下': 21,\n", + " '企经': 36,\n", + " '经营': 4337,\n", + " '营仍': 5,\n", + " '仍面': 109,\n", + " '面临': 2798,\n", + " '临着': 237,\n", + " '着很': 95,\n", + " '很多': 5336,\n", + " '多不': 222,\n", + " '不确': 565,\n", + " '确定': 2481,\n", + " '定和': 451,\n", + " '和不': 328,\n", + " '不稳': 291,\n", + " '定因': 139,\n", + " '因素': 2170,\n", + " '素下': 3,\n", + " '步效': 3,\n", + " '增速': 1786,\n", + " '速可': 46,\n", + " '可能': 9800,\n", + " '能有': 725,\n", + " '有所': 1861,\n", + " '所放': 80,\n", + " '放缓': 587,\n", + " '缓一': 12,\n", + " '是国': 1618,\n", + " '国际': 38923,\n", + " '际市': 394,\n", + " '场不': 297,\n", + " '定性': 962,\n", + " '性较': 157,\n", + " '较大': 1674,\n", + " '大国': 1087,\n", + " '际政': 53,\n", + " '政治': 5158,\n", + " '治动': 24,\n", + " '动荡': 240,\n", + " '荡加': 10,\n", + " '加剧': 405,\n", + " '剧贸': 2,\n", + " '贸易': 6135,\n", + " '易摩': 25,\n", + " '摩擦': 138,\n", + " '擦增': 3,\n", + " '增多': 501,\n", + " '多汇': 3,\n", + " '汇率': 1302,\n", + " '率大': 179,\n", + " '品价': 373,\n", + " '格等': 62,\n", + " '等波': 4,\n", + " '波动': 659,\n", + " '动加': 125,\n", + " '产成': 142,\n", + " '本和': 308,\n", + " '和融': 98,\n", + " '融资': 2982,\n", + " '资成': 179,\n", + " '本上': 584,\n", + " '上升': 2063,\n", + " '升压': 10,\n", + " '压力': 2576,\n", + " '力较': 190,\n", + " '大二': 95,\n", + " '是中': 3672,\n", + " '业解': 61,\n", + " '解决': 5182,\n", + " '决历': 8,\n", + " '史遗': 86,\n", + " '遗留': 106,\n", + " '留问': 68,\n", + " '问题': 16448,\n", + " '题化': 21,\n", + " '解过': 96,\n", + " '能处': 43,\n", + " '处置': 974,\n", + " '置僵': 21,\n", + " '僵尸': 102,\n", + " '尸企': 62,\n", + " '等方': 2750,\n", + " '方面': 12243,\n", + " '面改': 112,\n", + " '革成': 66,\n", + " '本支': 41,\n", + " '支出': 597,\n", + " '出压': 49,\n", + " '力增': 183,\n", + " '增大': 254,\n", + " '大三': 114,\n", + " '三是': 740,\n", + " '是受': 202,\n", + " '受电': 56,\n", + " '电煤': 42,\n", + " '煤价': 63,\n", + " '格上': 389,\n", + " '上涨': 3910,\n", + " '涨火': 2,\n", + " '火电': 105,\n", + " '电上': 21,\n", + " '上网': 349,\n", + " '网价': 4,\n", + " '格下': 303,\n", 
+ " '下调': 553,\n", + " '调以': 70,\n", + " '以及': 9147,\n", + " '及市': 132,\n", + " '场化': 629,\n", + " '化直': 12,\n", + " '直供': 22,\n", + " '供电': 362,\n", + " '电增': 15,\n", + " '增加': 5690,\n", + " '加等': 46,\n", + " '等因': 445,\n", + " '素影': 206,\n", + " '影响': 8739,\n", + " '响煤': 9,\n", + " '煤电': 202,\n", + " '电企': 117,\n", + " '益大': 31,\n", + " '幅下': 368,\n", + " '下滑': 753,\n", + " '滑亏': 2,\n", + " '损加': 5,\n", + " '剧她': 4,\n", + " '她表': 169,\n", + " '示下': 131,\n", + " '步国': 45,\n", + " '委和': 189,\n", + " '和央': 63,\n", + " '企将': 19,\n", + " '将进': 1183,\n", + " '步深': 219,\n", + " '深入': 3101,\n", + " '入开': 173,\n", + " '展降': 11,\n", + " '降本': 49,\n", + " '本增': 154,\n", + " '增效': 256,\n", + " '效工': 52,\n", + " '作加': 237,\n", + " '大处': 73,\n", + " '业的': 5831,\n", + " '的工': 3826,\n", + " '作力': 112,\n", + " '度做': 43,\n", + " '做好': 2147,\n", + " '好化': 9,\n", + " '作同': 150,\n", + " '时做': 116,\n", + " '好压': 4,\n", + " '压缩': 279,\n", + " '缩管': 3,\n", + " '理层': 169,\n", + " '层级': 180,\n", + " '级减': 6,\n", + " '少法': 22,\n", + " '法人': 892,\n", + " '人单': 246,\n", + " '单位': 4512,\n", + " '位工': 151,\n", + " '作深': 108,\n", + " '深化': 2679,\n", + " '化体': 325,\n", + " '体制': 1551,\n", + " '制机': 392,\n", + " '机制': 4775,\n", + " '制改': 849,\n", + " '革努': 6,\n", + " '努力': 3855,\n", + " '力保': 224,\n", + " '保持': 3758,\n", + " '持好': 21,\n", + " '好稳': 34,\n", + " '稳中': 415,\n", + " '中向': 320,\n", + " '向好': 605,\n", + " '好态': 58,\n", + " '态势': 1057,\n", + " '势n': 749,\n", + " 'n她': 313,\n", + " '时表': 1025,\n", + " '国企': 2157,\n", + " '革将': 104,\n", + " '将提': 311,\n", + " '提速': 333,\n", + " '速进': 69,\n", + " '步落': 76,\n", + " '落实': 3869,\n", + " '实中': 374,\n", + " '业功': 19,\n", + " '功能': 3237,\n", + " '能界': 5,\n", + " '界定': 124,\n", + " '定与': 161,\n", + " '与分': 85,\n", + " '分类': 1462,\n", + " '类方': 19,\n", + " '方案': 3761,\n", + " '案及': 44,\n", + " '及配': 103,\n", + " '配套': 1169,\n", + " '套措': 31,\n", + " '措施': 3970,\n", + " '施深': 25,\n", + " '入推': 490,\n", + " '推进': 8791,\n", + " '进公': 107,\n", + " 
'司制': 108,\n", + " '制股': 9,\n", + " '股份': 2670,\n", + " '份制': 130,\n", + " '制和': 398,\n", + " '和混': 42,\n", + " '混合': 1305,\n", + " '合所': 194,\n", + " '有制': 289,\n", + " '革加': 92,\n", + " '快完': 142,\n", + " '完善': 3326,\n", + " '善创': 22,\n", + " '创新': 12627,\n", + " ...})" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "two_gram_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Some More" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 教学演示版本, 如果你想获得更好的结果,需要查阅更多资料,然后有很多小的点(stop words, smooth, OOV(out of vacabulary)); \n", + "2. 我们需要更多数据;\n", + "3. 
数据也要保证高质量;" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n数学\\n\\n数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科,从某种角度看屬於形式科學的一種。數學透過抽象化和邏輯推理的使用,由計數、計算、量度和對物體形狀及運動的觀察而產生。數學家們拓展這些概念,為了公式化新的猜想以及從選定的公理及定義中建立起嚴謹推導出的定理。\\n\\n基礎數學的知識與運用總是個人與團體生活中不可或缺的一環。對數學基本概念的完善,早在古埃及、美索不達米亞及古印度內的古代數學文本便可觀見,而在古希臘那裡有更為嚴謹的處理。從那時開始,數學的發展便持續不斷地小幅進展,至16世紀的文藝復興時期,因为新的科學發現和數學革新兩者的交互,致使數學的加速发展,直至今日。数学并成为許多國家及地區的教育範疇中的一部分。\\n\\n今日,數學使用在不同的領域中,包括科學、工程、醫學和經濟學等。數學對這些領域的應用通常被稱為應用數學,有時亦會激起新的數學發現,並導致全新學科的發展,例如物理学的实质性发展中建立的某些理论激发数学家对于某些问题的不同角度的思考。數學家也研究純數學,就是數學本身的实质性內容,而不以任何實際應用為目標。雖然許多研究以純數學開始,但其过程中也發現許多應用之处。\\n\\n西方语言中“數學”()一詞源自於古希臘語的(),其有“學習”、“學問”、“科學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞(),意思為\"和學習有關的\"或\"用功的\",亦會被用來指\"數學的\"。其在英语中表面上的複數形式,及在法语中的表面複數形式\\',可溯至拉丁文的中性複數\\',由西塞罗譯自希臘文複數(),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。\\n\\n汉字表示的「數學」一詞大約产生于中国宋元時期。多指象數之學,但有時也含有今天上的數學意義,例如,秦九韶的《數學九章》(《永樂大典》記,即《數書九章》也被宋代周密所著的《癸辛雜識》記爲《數學大略》)、《數學通軌》(明代柯尚遷著)、《数学钥》(清代杜知耕著)、《數學拾遺》(清代丁取忠撰)。直到1939年,經過中國數學名詞審查委員會研究“算學”與“數學”兩詞的使用狀況後,確認以“數學”表示今天意義上的數學含義。\\n\\n數學有着久遠的歷史。它被認為起源於人類早期的生產活動:中國古代的六艺之一就有「數」,數學一詞在西方有希腊语詞源(mathematikós),意思是“学问的基础”,源于(máthema,“科学,知识,学问”)。\\n\\n史前的人類就已嘗試用自然的法則來衡量物質的多少、時間的長短等抽象的數量關係,比如时间单位有日、季節和年等。算術(加減乘除)也自然而然地產生了。古代的石碑及泥版亦證實了當時已有幾何的知識。\\n\\n更進一步則需要寫作或其他可記錄數字的系統,如符木或於印加帝國內用來儲存數據的奇普。歷史上曾有過許多不同的記數系統。\\n在最初有歷史記錄的時候,數學內的主要原理是為了做稅務和貿易等相關計算,為了解數字間的關係,為了測量土地,以及為了預測天文事件而形成的。这些需要可以简单地被概括为数学对數量、结构、空间及时间方面的研究。\\n\\n到了16世纪,算术、初等代数以及三角学等初等数学已大体完备。17世纪变量概念的产生使人们开始研究变化中的量与量的互相关系和图形间的互相变换,微积分的概念也在此時形成。随着數學轉向形式化,为研究数学基础而产生的集合论和数理逻辑等也开始发展。数学的重心从求解实际问题转变到对一般形式上的思考。\\n\\n從古至今,數學便一直不斷地延展,且與科學有豐富的相互作用,兩者的發展都受惠於彼此。在歷史上有著許多數學發現,並且直至今日都不斷地有新的發現。據Mikhail B. 
Sevryuk於2006年1月的期刊中所說,「存放於數學評論資料庫中論文和書籍的數量自1940年(數學評論的創刊年份)現已超過了一百九十萬份,而且每年還增加超過七萬五千份。此一學海的絕大部份為新的數學定理及其證明。」\\n\\n每當有涉及數量、結構、空間及變化等方面的困難問題時,通常就需要用到數學工具去解決問題,而這往往也拓展了數學的研究範疇。一開始,數學的運用可見於貿易、土地測量及之後的天文學。今日,所有的科學都存在著值得數學家研究的問題,且數學本身亦給出了許多的問題。牛頓和莱布尼兹是微積分的發明者,費曼發明了費曼路徑積分,這是推理及物理洞察二者的產物,而今日的弦理論亦引申出新的數學。一些數學只和生成它的領域有關,且用來解答此領域的更多問題。但一般被一領域生成的數學在其他許多領域內也十分有用,且可以成為一般的數學概念。即使是「最純的」數學通常亦有實際的用途,此一非比尋常的事實,被1963年諾貝爾物理獎得主維格納稱為「數學在自然科學中不可想像的有效性」。\\n\\n如同大多數的研究領域,科學知識的爆發導致了數學的專業化。主要的分歧為純數學和應用數學。在應用數學內,又被分成兩大領域,並且變成了它們自身的學科——統計學和電腦科學。\\n\\n許多數學家談論數學的\"優美\",其內在的美學及美。「簡單」和「一般化」即為美的一種。另外亦包括巧妙的證明,如歐幾里得對存在無限多質數的證明;又或者是加快計算的數值方法,如快速傅立葉變換。高德菲·哈羅德·哈代在《一個數學家的自白》一书中表明他相信單單是美學上的意義,就已經足夠作為純數學研究的正當理由。\\n\\n我們現今所使用的大部分數學符號在16世紀後才被發明出來的。在此之前,數學以文字的形式書寫出來,這種形式會限制了數學的發展。現今的符號使得數學對於專家而言更容易掌握,但初學者卻常對此望而却步。它被極度的壓縮:少量的符號包含著大量的訊息。如同音樂符號一般,現今的數學符號有明確的語法,並且有效地對訊息作編碼,這是其他書寫方式難以做到的。符号化和形式化使得数学迅速发展,并帮助各个科学领域建立基础支撑理论。\\n\\n數學語言亦對初學者而言感到困難。如“或”和“只”這些字有著比日常用語更精確的意思。亦困惱著初學者的,如“開放”和“域”等字在數學裡有著特別的意思。數學術語亦包括如“同胚”及“可積性”等專有名詞。但使用這些特別符號和專有術語是有其原因的:數學需要比日常用語更多的精確性。數學家將此對語言及邏輯精確性的要求稱為「嚴謹」。但在现实应用中,舍弃一些严谨性往往会得到更好的结果。\\n\\n嚴謹是數學證明中很重要且基本的一部份。數學家希望他們的定理以系統化的推理依著公理被推論下去。這是為了避免依著不可靠的直觀而推出錯誤的「定理」,而這情形在歷史上曾出現過許多的例子。在數學中被期許的嚴謹程度因著時間而不同:希臘人期許著仔細的論證,但在牛頓的時代,所使用的方法則較不嚴謹。牛頓為了解決問題所做的定義,到了十九世紀才重新以小心的分析及正式的證明來處理。今日,數學家們則持續地在爭論電腦輔助證明的嚴謹度。當大量的計算難以被驗證時,其證明亦很難說是足夠地嚴謹。\\n\\n公理在傳統的思想中是「不證自明的真理」,但這種想法是有問題的。在形式上,公理只是一串符號,其只對可以由公理系統導出的公式之內容有意義。希爾伯特計劃即是想將所有的數學放在堅固的公理基礎上,但依據哥德爾不完備定理,每一相容且能蘊涵皮亞諾公理的公理系統必含有一不可決定的公式;因而所有數學的最終公理化是不可能的。儘管如此,數學常常被想像成只是某種公理化的集合論,在此意義下,所有數學敘述或證明都可以寫成集合論的公式。\\n\\n卡爾·弗里德里希·高斯稱數學為「科學的皇后」。在拉丁原文\\',以及其德語\\'中,對應於「科學」的單字的意思皆為知識(領域)。而實際上,science一詞在英語內本來就是這個意思,且無疑問地數學在此意義下確實是一門「科學」。將科學限定在自然科學則是在此之後的事。若認為科学是只指物理的世界時,則數學,或至少是純數學,不會是一門科學。愛因斯坦曾如此描述:「數學定律越和現實有關,它們越不確定;若它們越是確定的話,它們和現實越不會有關。」\\n\\n許多哲學家相信數學在經驗上不具可否證性,且因此不是卡爾·波普爾所定義的科学。但在1930年代時,在數理邏輯上的重大進展顯示數學不能歸併至邏輯內,且波普爾推斷「大部份的數學定律,如物理及生物學一樣,是假設演繹的:純數學因此變得更接近其假設為猜測的自然科學,比它現在看起來更接近。」然而,其他的思想家,如較著名的拉卡托斯,便提供了一個關於數學本身的可否證性版本。\\n\\n另一觀點則為某些科學領域(如理論物理)是其公理為嘗試著符合現實的數學。而事實上,理論物理學家齊曼(John 
Ziman)即認為科學是一種公眾知識,因此亦包含著數學。在任何的情況下,數學和物理科學的許多領域都有著很多相同的地方,尤其是從假設所得的邏輯推論之探索。直覺和實驗在數學和科學的猜想建構上皆扮演著重要的角色。實驗數學在數學中的重要性正持續地在增加,且計算和模擬在科學及數學中所扮演的角色也越來越加重,減輕了數學不使用科學方法的缺點。在史蒂芬·沃爾夫勒姆2002年的著作《一種新科學》中他提出,計算數學應被視為其自身的一科學領域來探索。\\n\\n數學家對此的態度並不一致。一些研究應用數學的數學家覺得他們是科學家,而那些研究純數學的數學家則時常覺得他們是在一門較接近邏輯的領域內工作,且因此基本上是個哲學家。許多數學家認為稱他們的工作是一種科學,是低估了其美學方面的重要性,以及其做為七大博雅教育之一的歷史;另外亦有人認為若忽略其與科學之間的關聯,是假裝沒看到數學和其在科學與工程之間的交互影響,進而促進了數學在許多科學上的發展此一事實。這兩種觀點之間的差異在哲學上產生了數學是「被創造」(如藝術)或是「被發現」(如科學)的爭議。大学院系划分中常见「科学和数学系」,这指出了这两个领域被看作有緊密聯繫而非一樣。實際上,數學家通常會在大體上與科學家合作,但在細節上卻會分開。此爭議亦是數學哲學眾多議題的其中一個。\\n\\n如上所述,數學主要的學科最先產生於商業上計算的需要、了解數字間的關係、測量土地及預測天文事件。這四種需要大致地與數量、結構、空間及變化(即算術、代數、幾何及分析)等數學上廣泛的子領域相關連著。除了上述主要的關注之外,亦有用來探索由數學核心至其他領域上之間的連結的子領域:至邏輯、至集合論(基礎)、至不同科學的經驗上的數學(應用數學)、及較近代的至不確定性的嚴格研究。\\n為了闡明數學基礎,數學邏輯和集合論等領域被發展了出來。\\n\\n數學邏輯專注於將數學置在一堅固的公理架構上,並研究此一架構的結果。就數學邏輯本身而言,其為哥德爾第二不完備定理所屬的領域,而這或許是邏輯中最廣為流傳的成果-總存在一不能被證明而又為真的定理。\\n\\n現代邏輯被分成遞歸論、模型論和證明論,且和理論電腦科學有著密切的關連性,千禧年大獎難題中的P/NP問題就是理論電腦科學中的著名問題。\\n\\n數量的研究起於數,一開始為熟悉的自然數及整數與被描述在算術內的自然數及整數的算術運算。整數更深的性質於數論中有詳細的研究,此一理論包括了如費馬最後定理等著名的結果。數論還包括兩個被廣為探討的未解問題:孿生質數猜想及哥德巴赫猜想。\\n\\n當數系更進一步發展時,整數被視為有理數的子集,而有理數則包含於實數中,連續的量即是以實數來表示的。實數則可以被進一步廣義化成複數。數的進一步廣義化可以持續至包含四元數及八元數。從自然數亦可以推廣到超限數,它形式化了計數至無限的這一概念。另一個研究的領域為大小,這個導致了基數和之後對無限的另外一種概念:阿列夫数,它允許無限集合之間的大小可以做有意義的比較。\\n\\n許多如數及函數的集合等數學物件都有著內含的結構。這些物件的結構性質被探討於群、環、-{zh-cn:域;zh-tw:體}-等抽象系統中,該些物件事實上也就是這樣的系統。此為代數的領域。在此有一個很重要的概念,即廣義化至向量空間的向量,它於線性代數中被研究。向量的研究結合了數學的三個基本領域:數量、結構及空間。向量分析則將其擴展至第四個基本的領域內,即變化。\\n\\n创立于二十世纪三十年代的法国的布尔巴基学派认为:纯粹数学,是研究抽象结构的理论。\\n结构,就是以初始概念和公理出发的演绎系统。\\n布尔巴基学派认为,有三种基本的抽象结构:代数结构(群,环,域……),序结构(偏序,全序……),拓扑结构(邻域,极限,连通性,维数……)。\\n\\n空間的研究源自於幾何-尤其是欧几里得几何。三角學則結合了空間及數,且包含有著名的勾股定理。現今對空間的研究更推廣到了更高維的幾何、非歐幾里得幾何(其在廣義相對論中扮演著核心的角色)及拓撲學。數和空間在解析幾何、微分幾何和代數幾何中都有著很重要的角色。在微分幾何中有著纖維叢及流形上的微積分等概念。在代數幾何中有著如多項式方程的解集等幾何物件的描述,結合了數和空間的概念;亦有著拓撲群的研究,結合了結構與空間。李群被用來研究空間、結構及變化。在其許多分支中,拓撲學可能是二十世紀數學中有著最大進展的領域,並包含有存在已久的龐加萊猜想,以及有爭議的四色定理。龐加萊猜想已在2006年确认由俄罗斯数学家格里戈里·佩雷尔曼證明,而四色定理已在1976年由凱尼斯·阿佩爾和沃夫岡·哈肯用電腦證明,而從來沒有由人力來驗證過。\\n\\n了解及描述變化在自然科學裡是一普遍的議題,而微積分更為研究變化的有利工具。函數诞生於此,做為描述一變化的量的核心概
念。對於實數及實變函數的嚴格研究為實分析,而複分析則為複數的等價領域。黎曼猜想-數學最基本的未決問題之一-便是以複分析來描述的。泛函分析注重在函數的(一般為無限維)空間上。泛函分析的眾多應用之一為量子力學。許多的問題很自然地會導出一個量與其變化率之間的關係,而這在微分方程中被研究。在自然界中的許多現象可以被動力系統所描述;混沌理論則是對系統的既不可預測而又是決定的行為作明確的描述。\\n離散數學是指對理論電腦科學最有用處的數學領域之總稱,這包含有可計算理論、計算複雜性理論及資訊理論。可計算理論檢驗電腦的不同理論模型之極限,這包含現知最有力的模型-圖靈機。複雜性理論研究可以由電腦做為較易處理的程度;有些問題即使理論是可以以電腦解出來,但卻因為會花費太多的時間或空間而使得其解答仍然不為實際上可行的,儘管電腦硬體的快速進步。最後,資訊理論專注在可以儲存在特定媒介內的資料總量,且因此有壓縮及熵等概念。\\n\\n作為一相對較新的領域,離散數學有許多基本的未解問題。其中最有名的為P/NP問題-千禧年大獎難題之一。一般相信此問題的解答是否定的。\\n\\n應用數學思考將抽象的數學工具運用在解答科學、工商業及其他領域上之現實問題。應用數學中的一重要領域為統計學,它利用機率論為其工具並允許對含有機會成分的現象進行描述、分析與預測。大部份的實驗、調查及觀察研究需要統計對其資料的分析。(許多的統計學家並不認為他們是數學家,而比較覺得是合作團體的一份子。)數值分析研究有什麼計算方法,可以有效地解決那些人力所限而算不出的數學問題;它亦包含了對計算中捨入誤差或其他來源的誤差之研究。\\n\\n數學獎通常和其他科學的獎項分開。數學上最有名的獎為菲爾茲獎,創立於1936年,每四年頒獎一次。它通常被認為是數學的諾貝爾獎。另一個國際上主要的獎項為阿貝爾獎,創立於2003年。兩者都頒獎於特定的工作主題,包括數學新領域的創新或已成熟領域中未解決問題的解答。著名的23個問題,稱為希爾伯特的23個問題,於1900年由德國數學家大衛·希爾伯特所提出。這一連串的問題在數學家之間有著極高的名望,且至少有九個問題已經被解答了出來。另一新的七個重要問題,稱為千禧年大獎難題,發表於2000年。對其每一個問題的解答都有著一百萬美元的獎金,而當中只有一個問題(黎曼猜想)和希爾伯特的問題重複。\\n\\n\\n\\n\\n\\n\\n\\n哲学\\n\\n哲學()是研究普遍的、根本的问题的学科,包括存在、知识、价值、理智、心灵、语言等领域。哲学与其他学科的不同是其批判的方式、通常是系统化的方法,并以理性论证為基礎。在日常用语中,其也可被引申为个人或团体的最基本信仰、概念或态度。\\n\\n英語詞語()源于古希臘語中的,意思為「愛智慧」,有时也译为「智慧的朋友」,该词由(philos,爱)的派生词(Philein,去爱)和(Sophia,智慧)组合而成。一般认为,古希腊思想家毕达哥拉斯最先在著作中引入“哲学家”和“哲学”这两个术语。\\n\\n“哲”一词在中国起源很早,如“孔门十哲”,“古圣先哲”等词,“哲”或“哲人”,专指那些善于思辨,学问精深者,即西方近世“哲学家”,“思想家”之谓。在《易經》當中已經開始討論哲學問題,形而上学的中文名稱取自《易經·繫辭上傳》「形而上者谓之道,形而下者谓之器」一語。1874年,日本啟蒙家西周,在《百一新論》中首先用漢文「哲學」來翻譯\"philosophy\"一詞。\\n\\n英国哲学家罗素对哲学的定义是:\\n\\n胡適在《中国哲学史大纲》中称「凡研究人生切要的问题,从根本上着想,要寻一个根本的解决:这种学问叫做哲学」。\\n\\n雖然哲學源自西方的傳統,但許多文明在歷史上都存在著一些相似的論題。東亞和南亞的哲學被稱之為東方哲學,而北非和中東則因為其和歐洲密切的互動,因此常被視為是西方哲學的一部份。\\n\\n對哲學的主題亦存在許多看法。一些人認為哲學是對問題本身過程的審查;另外一些人則認為實質上存在著哲學必須去回答的哲學命題。\\n\\n\\n古希臘哲學家透過問問題來進行哲學實踐,他們所提的問題大概可以歸類為三類,這三類問題分別形成了哲學的基礎學科——分别是形而上学、伦理学、认识论(或知识论) 
。\\n\\n有意思的是,现代哲学上蒙现出\"不要求精确理由\"的哲学论调,如\"本质技巧\"(认定本质不可知),这种现象将不可知论(世界上终究有人不能理解的存在)的重要性提高了。\\n\\n哲學可以分为很多不同的分支,主要包括形而上學、知識論、倫理學、邏輯學和美學。\\n\\n\\n很多人类社群思考过哲学问题并且互相学习建立了各种哲学流派。\\n\\n东方哲学是通过每个地区的历史时期来组织的。西方哲学一般可以分为三个或更多时期,最重要的是古典哲学、中世纪哲学和近代哲学。\\n\\n印度哲學的歷史源遠流長,早在吠陀時代已經開始,至公元前6世紀為全盛時期。當時古印度的思想界百花齊放,其中最著名的包括佛教創始人釋迦牟尼佛、耆那教創始人笩駄摩那、阿耆多·翅舍欽婆羅、波拘陀·迦旃延、富蘭那·迦葉、數論派等。\\n\\n中國哲學的主要部分起源東周時期,当时以诸子百家广为人知,以孔子的儒家、老子的道家、墨子的墨家及晚期的法家為代表,还有一些流派例如农家、阴阳家和名家在之后则名声不显。在秦朝焚书坑儒后除了法家、儒家、道家外其他流派都不再活跃。在當代,中國哲學仍然在亞洲文化扮演一定作用,但是學理上仍在爭辯中國哲學是否應歸為哲學。\\n\\n古希腊-哲学是西方哲学的一个时期,时间为公元前6世纪[约585]到公元6世纪。它一般被分为三个时期:前苏格拉底时期、柏拉图和亚里士多德的古典希腊时期、和后亚里士多德(或希腊化)时期:有时候会把新柏拉图主义和基督教哲学家们的古典时代晚期加入作为第四个时期。\\n\\n在公元前6世纪的希腊,西方哲学就从古代神话和诗歌中脱颖而出,逐步开始对宇宙的组成以及本源的思考而开始了独立发展。前苏格拉底时期的自然派哲學家们多关注自然界,被認為是西方最早的哲學家,不管他們認識以及解释世界的方式是否正確,但是他們的想法之所以有別於迷信的原因在於,這些哲學家是以理性輔佐證據的方式歸納出自然界的现象。诸如:\\n\\n\\n公元前5世纪中期,普罗泰戈拉和高尔吉亚等所形成的辯士學派将研究的重点由自然转移到人类本身。认为“人才是万物之本”。他们都不相信有真正的存在和真理。普罗泰戈拉认为是非善恶都是相对于人的感觉而言,而高尔吉亚却认为所有的都是同样的假,这是怀疑论的雏形。 \\n\\n公元前6世纪末,以毕达哥拉斯为主的毕达哥拉斯学派所主张的哲学与前述的观点既相近又有不同。罗马古代的历史上记载毕达哥拉斯第一个称自己为哲学家,或者说是爱智慧。他认为“一切都是数字”。其意思就是说一切事物的实质和结构都是它们所包含的数字关系所决定的。他称平均、秩序和调和是宇宙的三大基调,并以音乐的调和说明宇宙的调和。他所在的学派将宇宙总结为十种性质相异的组合:有限与无限、奇与偶、一与多、左与右、男与女、静与动、直与曲、光明与黑暗、善与恶、方与圆。至此之后,数学的本质及其地位,一直都是哲学的主要问题之一,数学不受观察和实验造成的不确定性影响,而且是通过纯粹的思想加以理解的。\\n\\n其中关于变与不变的关系的争论,真实世界与直觉世界的差别,真理与意见的矛盾,导致产生了认识论的问题。\\n\\n在古典希腊时期西方哲学方法的关键特质被建立:依靠诉诸理性和论证,通过一种批判性的方法来接受或建立观点。这包括苏格拉底被称为蘇格拉底反詰法或“反驳论证”方法的辩证法,他主要用其来检验例如善良和公平正義的关键道德概念。这种方法将一个问题分解成一系列的疑问,在对疑问的回答中逐步提取想要找到的答案,其极大影响可以从现在使用的科学方法中看出,在科学方法中假说是第一个阶段。\\n\\n苏格拉底没有直接教过人,但之后的柏拉图深受其影响。而其整个哲学思想来源于两大理论:其一,永远不要做坏事;其二,一个内心真正善良且正义的人绝不会做相反之事。他认为真理有其客观性,试图推翻智者们以个人主观感觉为真理的思想。然后提出德的概念,以作为人生行事的方向。对于道德是什么的问题,苏格拉底的回复为“知识即道德。”对于知识是何物的问题,他回答说知识是透过理性而得的概念。苏格拉底开创了认识论和伦理学,如此奠定了他的哲学地位。\\n\\n古典希臘時期的的哲学家中柏拉图和亚里士多德对后世的影响力最大,特别是柏拉图被认为是西方哲学的创始人。哲学家阿爾弗雷德·諾思·懷特黑德评价柏拉图:“欧洲哲学传统最被普遍公认的特点,就是它包含了一系列对柏拉图的注脚。我的意思不是怀疑学者们系统体系的思想是提取自柏拉图的著作。我暗示的是那些他们散落的一般思想的财富。”換言之即使數千年後,人們依舊在試著回答他所提出的問題,這也代表著人們依然為這些問題或是這些問題所延伸的更多問題而感到困惑。\\n\\n毕达哥拉斯的思想对柏拉图产生了显著地影响,并通过柏拉图影响了整个西方哲学。柏拉图和亚里士多德作为最早的古典希腊哲学家批判地引用了其它的一些”智
者“,当时这些人在希腊被称为“辩士”并在毕达哥拉斯之前相当普遍。从他们的批判看来,在他们的古典时代一个在更高尚地、纯粹地”爱智慧”(真的哲学家)与那些更早更普遍的旅行教师——经常也通过自己的技艺来赚钱——之间的分水岭之后被建立。\\n\\n亚里士多德死后,整个哲学界陷入了独立时期,称为时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学,并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。\\n\\n直到公元529年,罗马皇帝查士丁尼一世尼命令关闭雅典的柏拉图学院。称一些余下的学院成员逃入了萨珊王朝首都泰西封。\\n\\n印度哲學是指起源於印度次大陸的哲學思想,包括、、等,這些印度哲學具有一些共同且複雜的起源,都有有關佛法及業的主題,而且都希望達到個人的解放。這些哲學約在西元前一世紀到西元幾世紀的時間成形。\\n\\n中世纪哲学指的是西欧和中东在中世纪的哲学体系,其时间范围没有定论,大致上是从基督化的罗马帝国时期至文艺复兴时期。中世纪哲学被部分定义为对古典希腊和希腊化哲学的再发现和进一步发展,另一部分是需要解决神学问题并把亞伯拉罕諸教(伊斯兰教、犹太教和基督教)的教条同世俗知识一同整合并推广。\\n\\n文艺复兴人文学者们排斥中世纪时期,把它当作在希腊罗马的古典时代与古典文化“复兴”之间的一个“过渡”的野蛮时期。然而在中世纪这将近一千年中哲学在欧洲仍取得了长足地发展。认为\"在强度、复杂度还有成就上,可以确信地说哲学在十三世纪的兴盛能与公元前四世纪古希腊哲学的黄金时期媲美。\"\\n\\n这个时代讨论的问题有信仰和理智的关系,神的存在与统一,神学话题和形而上学,关于知识、宇宙和个人的问题。\\n\\n中世纪的哲学家包括基督教学者希波的奥古斯丁、波爱修斯、安瑟倫、、皮埃尔·阿伯拉尔、罗吉尔·培根、圣文德、托马斯·阿奎那、邓斯·司各脱、奥卡姆的威廉和让·布里丹等;犹太哲学家迈蒙尼德和;还有穆斯林哲学家肯迪、法拉比、海什木、伊本·西那、安薩里、伊本·巴哲、、伊本·赫勒敦和伊本·魯世德等。中世纪的经院哲学传统一直到17世纪仍在活跃,例如和等人物。其中托马斯主义之父阿奎那极大地影响了整个天主教欧洲,他特别强调理性和论证,是最先开始使用亚里士多德形而上学和知识论的著作的新译本的学者之一。他的工作明显远离了统治大部分早期经院哲学的新柏拉图主义和奥古斯丁的思想。\\n\\n从文艺复兴开始,人们的思想开始从清净的僧院走出,来到喧嚣的尘世。从而发展自然,也发展人类自身。从而形成人文主义和自然哲学两股既有联系又有区别的思潮。\\n\\n\"文艺复兴\"是对中世纪到近代之间过渡时期的通称,那时对古典文献的重新学习帮助把哲学界的兴趣从对逻辑学、形而上学和神学领域的钻研转移到包括道德、语言学和神秘主义的更加广泛的研究。对经典和人文艺术例如历史学和文学的研究在基督教世界学术界中享有前所未有的兴趣,这个趋势被称为人文主义,它受到柏拉图主义、希腊怀疑主义和罗马斯多葛主义的影响。人文主义者的哲学兴趣跟随彼特拉克转移到造物主与其美德上,替代了中世纪时对形而上学和逻辑学的兴趣。\\n\\n那时对古典哲学的研究出现了两种新方式。一方面对亚里士多德的研究因为的影响而产生了变化。阿威罗伊亚里士多德主义者和更正统的天主教亚里士多德主义者譬如艾爾伯圖斯·麥格努斯和托马斯·阿奎那之间的分歧最终在文艺复兴发展出一种“人文亚里斯多德哲学”,譬如和的思想。另一方面,在一些之前不为西欧所熟知的作品的重发现的帮助下,对柏拉图和新柏拉图主义的研究作为另一个选择变得普遍起来。著名的文艺复兴时期的柏拉图主义者包括库萨的尼古拉,还有之后的马尔西利奥·费奇诺和若望·皮科·德拉·米蘭多拉。\\n\\n文艺复兴也重新产生了对反亚里士多德的把自然看作一个有机的、活生生的整体而不取决于神学的理论的兴趣,例如在库萨的尼古拉、尼古拉·哥白尼、焦爾達諾·布魯諾、还有托马索·康帕内拉的著作中。在自然哲学中这样的运动与对神秘主义、魔法、赫尔墨斯主义还有占星学等兴趣重燃相契合,它们被认为隐藏着收获知识和掌控自然方法的大门。\\n\\n这些新的哲学运动伴随着欧洲宗教和政治的剧变同时出现:宗教改革和封建制的衰落。虽然参与宗教改革的神学家们对哲学没有直接的兴趣,他们打破了神学和知识权威的传统基础。同时还伴随着信仰主义和怀疑主义的复兴,体现在伊拉斯谟,蒙泰涅和等思想家身上。同时,民族国家政治上逐步的中央集权的过程得到了世俗政治哲学的响应,如尼可罗·马基亚维利(常被描述为第一个现代政治思想家,或者是现代政治
思想形成的关键点)、托马斯·莫尔、伊拉斯谟、尤斯图斯·利普修斯、让·博丹和胡果·格老秀斯等的著作。\\n\\n先秦諸子之后的兩漢經學、魏晉玄學等都是中國哲學的一部份,自唐朝起佛教也开始对哲学产生重要影响;不过中世纪中国哲学最主要的部分是宋明理學的发展。\\n\\n宋明理学反对汉代后开始影响儒学的道教和佛教中的迷信和神秘的元素,是一股倡导更加理性和世俗化儒学的哲学运动。尽管理学遭到道教和佛教徒的批评,理学仍借鉴了它们两个的部分术语和概念。然而和佛教和道教把形而上学看作心灵发展、宗教启示的催化剂并且是不朽的不同,宋明理学把形而上学当作建立一个理性的伦理体系的指导。宋明理学的起源可以追溯到唐朝:韩愈和李翱被视为宋代理学的先驱。宋代理学家周敦颐以道教形而上学理论为框架建立了他的伦理哲学体系,他被看作是宋明理学的创始人。\\n\\n在东亚的其他地方,日本哲学形成于本土的神道信仰和佛教、儒家以及另一些中国哲学和印度哲学学派混合发展。与日本类似,在中巫教的情绪化内容被混合到了从中国传入的理学当中。\\n\\n主條目:近代哲學\\n\\n西方哲学史上的近代早期一般指17世纪和18世纪,其中18世纪常被称为启蒙时代。现代哲学不同于其前身,它和传统权威例如教会、学院、亚里士多德的关系更加独立,出现了对知识基础和形而上学体系建设的新兴趣;和摆脱了自然哲学的近代物理学的出现。从17世纪开始,近代哲学就以认识论为研究重点。由于经验论(经验主义)与唯理论(理性主义)的争论,使物质与精神的关系作为认识论的首要问题突显出来。\\n当时其他的哲学焦点包括精神的天性和其与身体的关系,新的自然科学对诸如自由意志和神的传统上属于神学的话题的影响,和伦理学和政治哲学的世俗基础\\n。这种潮流最早被鲜明地体现在弗兰西斯·培根的被称为用来扩展知识的新的、经验主义的程序,并很快在笛卡儿的机械主义物理学和理性主义的形而上学中建立了具有巨大影响力的形式。培根运用归纳法,第一个提出思维的主体“人”应该主动干涉自然来为人服务。\\n\\n近现代政治哲学的鼻祖托马斯·霍布斯最早将这套方法论系统得应用在政治哲学上,包括\"社會契約\"的近代理论。早期近代哲学的学术经典一般包括笛卡尔、斯賓諾莎、莱布尼茨、洛克、贝克莱、休谟和康德。同时其的其他思想家也对哲学做出了贡献,例如伽利略、皮埃尔·伽桑狄、布莱兹·帕斯卡、马勒伯朗士、艾萨克·牛顿、、孟德斯鸠、、托马斯·里德、让·勒朗·达朗贝尔和亚当·斯密,而让-雅克·卢梭是反启蒙运动的开创性人物。早期近代哲学的大致结束通常被确定为伊曼努尔·康德的试图限定形而上学范围、证明科学知识并用道德和自由来调和两者的体系。\\n\\n理性主义者中勒内·笛卡儿认为物质世界是由数学关系组成的单一体系,他企图将物理学转化为数学。他在其著作中,对整个经院哲学以及在他那个时代流行的教育与哲学体系加以讽刺。其认为“我思故我在”是认识论的无可怀疑之出发点。笛卡尔是割裂精神和物质的二元论者,为了厘清二者关系,他坚定认为在上帝那里,精神和物质是统一的。其理论被称为笛卡尔主义\\n。斯宾诺莎是笛卡尔之后,又一位著名的唯理论者。他的认识论、几何学和机械观都来自于笛卡尔。但他不认同笛卡尔的二元论,认为精神和物质不过是唯一实体的两种属性\\n。莱布尼茨作为唯理论者坚定地维护笛卡尔的学说并反驳约翰·洛克的理论。与笛卡尔不同的是,他认为万物的实体是“单子”,且互相没有关系,而是由于“前定和谐”才共存一体,即存在于神之中。“前定和谐”调和了笛卡尔之二元论和斯宾诺莎之实体双重性。\\n洛克发展了经验论,他不认同笛卡尔的“天赋观念”,提出白板说,他强调人们从感觉中抽象出普遍的概念,认为感觉中的个别东西才是第一位的。不过他基本认同笛卡尔的二元论。贝克莱发展了洛克的哲学理论,提出了“存在就是被感知”。他认为除了感知的主题和被感知的知觉之外,什么也没有。他非常不赞同物质的抽象概念,认为其既无客观实在,也不能存在于人心。大卫·休谟的理论比贝克莱的更进一步,他不仅仅认为物质实体不存在,更认为精神实体不存在。只承认知觉的存在。他还以自己的不可知论和怀疑论认为不存在统一性和普遍性的东西,认定多样性和个别性才是最高原理。\\n\\n经验论与唯理论的争论也包含了唯物主义与唯心主义的争论。在18世纪时,法国的拉美特利公开宣布唯物主义是唯一的,而百科全书的主编德尼·狄德罗也拒绝承认神的存在。\\n\\n另外伏尔泰,孟德斯鸠和其他百科全书派的学者都有涉及政治和伦理领域。他们都认为机械主义才是最终形式——物质是唯一的且处于永恒运动的,精神只是人脑的属性。因此他们认为无机物与有机物不可逾越,人的思维是人感官的结果。不过他们仍然是经验主义者,在因果性上,他们认为只有必然性才是唯一的,这就成为唯心主义的观念。\\n\\n从18
世纪中后期开始,直到19世纪初,哲学便进入了近代哲学的总结时期,这就是德国古典哲学时期。有两条线索标志着转折的到来:一、思维与存在的关系更加明确;二、产生了系统辩证法。其代表人物有I.康德、J.G.费希特、F.W.谢林、G.W.F.黑格尔等。\\n\\n康德给哲学带来了三个标志性的创造:\\n\\n\\n他受到休谟的诸多影响,并为西方哲学带来一次革命。他认为哲学的研究核心就是规定理性能做什么以及不能做什么。\\n\\n康德同意休谟的理论并认为,存在一些原则,使得心灵对经验和认识加以组织,而证据皆可以在数学中找到。即是,包含在命题里的要比包含在原是概念的定义要多得多。他使用称之为批判哲学的先验方法,来展现经验的某些范畴和形式都必然地被预先存在于人们一切言谈之中。\\n\\n凭借着他的三部“批判性”的著作,为先验方法作出相应的结构:\\n\\n\\n他还为道德哲学奠定了新基础,且他赋予了自由概念的新意义。因为其影响在现代依旧尚存,其理论被人们称为康德主义。\\n费希特本来承认斯宾诺莎的机械的因果决定论,但后来受到康德的影响,开始认为因果决定论只是表面,其实质为自我不是必然性的奴仆而是独立自由的主体。就此,他建立了主观的思维与客观的存在之统一说。\\n\\n谢林是从费希特理论出发的,但深受斯宾诺莎和文学上浪漫主义的影响,创立了自己的学说。即他认为自然和精神、存在和思维,客体和主体,表面相反,实则统一,是同一个“绝对”的不同发展阶段,这个“绝对”即是万事万物的根源。他认为艺术才是最直观的理性。\\n黑格尔及其理论的出现将西方哲学的推上一个新高度,他创立了西方哲学史上最庞大的客观唯心主义体系,并系统地阐述了辩证法。他的理论和学说对近现代哲学产生了很深远的影响,并被称为黑格尔主义。\\n\\n从黑格尔的思想体系中发展而成的多种哲学运动。其重点就是以历史和逻辑为主,历史方面,它从不同角度理解“凡是合理的就是现实的”;逻辑方面,它有发现其中所说的“真理即整体”。\\n\\n黑格尔认为哲学的重点是放弃分裂,达到统一。他把以前的时代说成是思维与存在、理想与现实分裂,自由与必然,个人与社会、无限与有限、统一性与多样性分裂之时代。\\n\\n他从康德的“心灵的合理性以及在经验中的积极作用”的概念出发,但反对康德的“超越经验世界和‘物自身’的世界”,并认为心灵和世界一样具有相同基础理性结构。他所认为的普遍性不是脱离特殊的抽象普遍,而是包含特殊在内之普遍,即为具体普遍;他所认为的统一也非脱离矛盾、对立的抽象统一,而是包含它们在内的统一,即为对立统一。上述综合在一起即是他的理论:最真实的无所不包的整体即是“绝对精神”,又是对立的统一。\\n\\n他认为,为了达到这个“绝对精神”,需要经过三个阶段,从逻辑、自然到精神,即是从思维到存在,再到两者统一的过程,从而完成他的统一论。\\n\\n就此,社会和历史的现象,便被赋予一种在哲学史上还是崭新的显赫地位。他还将伦理学划归到这个领域,从而在伦理学理论和对思想的理解中提出重要的路线。\\n\\n从19世纪中叶开始,西方哲学就进入现代哲学阶段。因为在19世纪中期,欧洲的工业革命几近完成。\\n\\n现代哲学,特别是19世纪中后期的哲学流派,有叔本华的意志主义,新康德主义,新黑格尔主义,马克思主义。然而此時的哲學與後來的存在主義、現象學等在當代一般歸為「歐陸哲學」,與二十世紀以後著重嚴謹邏輯與語詞分析所發展出的「分析哲學」成為風格迥異的兩大西方哲學典範。\\n\\n20世纪的西方哲学上主流有两条:\\n\\n现代哲学主要包含以下几种潮流。\\n历程哲学:\\n\\n主流马克思主义:\\n\\n西方马克思主义:\\n\\n革新的黑格尔主义:\\n\\n结构主义:\\n\\n分析哲学:\\n\\n实证主义:\\n\\n新康德主义:\\n\\n逻辑实证主义:\\n\\n语言哲学:\\n\\n现象学:\\n\\n唯物论:\\n\\n新托马斯主义:\\n\\n科學哲學:\\n\\n意志主义:\\n\\n实用主义:\\n\\n存在主义:\\n\\n解释学:\\n\\n唯心主義的各种变体在18世紀晚期至20世紀早期的哲学界相当流行。康德主张的先验唯心主义认为人们对事物的理解是有界限的,因为在客观判断条件下很多事情是办不到的。他在1781年發行的作品《纯粹理性批判》試圖調和18世紀兩大主要的哲學派別:經驗主義和理性主義,并且建立一个研究形而上学的新基础。\\n\\n德國唯心主義最著名的作品是黑格尔于1807年出版的《精神现象学》。黑格尔承认自己的理念不是新的,不过他的目标是完成之前的哲学家们的不完整的体系。黑格尔认为哲学的重点是放弃分裂,达到统一。他把以前的时代说成是思维与存在、理想与现实分裂,自由与必然、个人与社会、无限与有限、统一
性与多样性分裂之时代。他从康德的“心灵的合理性以及在经验中的积极作用”的概念出发,但反对康德的“超越经验世界和‘物自身’的世界”,并认为心灵和世界一样具有相同基础理性结构。他所认为的普遍性不是脱离特殊的抽象普遍,而是包含特殊在内之普遍,即为具体普遍;他所认为的统一也非脱离矛盾、对立的抽象统一,而是包含它们在内的统一,即为对立统一。上述综合在一起即是他的理论:最真实的无所不包的整体即是“绝对精神”,又是对立的统一。黑格尔认为需要经过三个阶段来达到这个“绝对精神”,从逻辑、自然到精神,即是从思维到存在,再到两者统一的过程,从而完成他的统一论。他还将伦理学划归到这个领域,从而在伦理学理论和对思想的理解中提出重要的路线。\\n马克思主义哲学是马克思和恩格斯建立的以辩证唯物主义为核心的哲学体系。其认为实践是检验哲学之真理性的最终标准,哲学应伴随着社会、科学技术和文化的发展而不断发展。其主要思想体系在19世纪70年代主要由恩格斯创立,20世纪20年代在苏联形成完整体系——辩证唯物主义和历史唯物主义,这个体系在后来的社会主义国家推动下得以发展。马克思主义哲学宣称自己的理论体系具有科学性,认为哲学可以成为科学的一部分。同时马克思主义哲学认为哲学还具有意识形态的性质。\\n\\n另外马克思主义在政治上也指各种不同的共产主义运动,如由列宁所创立而被斯大林修改的苏联马克思主义,称为马克思列宁主义,为俄国革命以及后来建立的各种共产党之教义。它的旁系包括反斯大林的托洛茨基及其追随者的马克思主义、毛泽东的马克思列宁主义等。\\n實用主義產生於19世紀70年代的現代哲學派別,在20世紀的美國成為一種主流思潮。對法律、政治、教育、社會、宗教和藝術的研究產生了很大的影響。實用主義也試圖在理性主義及經驗主義找出一條中間道路來,是「經驗主義思想方法與人類的比較具有宗教性需要的適當的調和者。」\\n\\n現象學是由德國哲學家胡塞爾在1900年提出的理論,強調對直接直觀和經驗感知的區分,認為哲學(或至少是現象學)的主要任務是釐清二者之間的關聯,並且在直觀中獲得對本質的認識。现象学是对经验结构与意识结构的哲学性研究。作为一个哲学运动,现象学于二十世纪早期由埃德蒙德·胡塞尔创立,之后被他在德国的哥廷根大学和慕尼黑大学中的一派追随者发展壮大。在此之后现象学传播到法国、美国以及其他地区,并远超出了胡塞尔早期著作的语境。 其他主要哲學家包括海德格(Martin Heidegger), 梅洛—龐蒂(Maurice Merleau-Ponty), 以及列維納斯(Emmanuel 
Lévinas)。\\n\\n存在主义是一个哲学的非理性主义思潮,该术语被用在十九世纪晚期到二十世纪的一些哲学家的工作上,尽管他们的学说相差巨大,但他们都相信哲学思考开始于人类主体——而不仅仅是思维主体,而且包括行为、感知、人类个体。存在主义强调个人、獨立自主和主观经验,認為人存在的意義是無法經由理性思考而得到答案。在存在主义中,个体的出发点的特征是被称为“存在的态度”,或一种面对显然是一个无意义的或荒谬的世界的迷失和混乱的感觉。很多存在主义者还认为传统的体系和哲学学术无论是内容和风格都过于抽象并远离人类经验。\\n\\n19世纪哲学家克尔凯郭尔和尼采被看作存在主义的先驱,尽管他们没有使用这个术语。然而他们的影响延伸出了存在主义思想。克尔凯郭尔著作主要针对的是黑格尔的唯心主义哲学体系,他认为其忽视或排除了人类的内在主观生命。相反克尔凯郭尔认为\"真理是主观的\",主张对一个现实的人类来说最重要的问题是处理个人与存在内在关系的问题。克尔凯郭尔作为一个基督徒相信宗教信仰的真相是一个主观问题,而且人应该用热情去深思这个问题。\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n文學\\n\\n文學(),在最廣泛的意義上,是任何單一的書面作品。更嚴格地說,文學寫作被認為是一種藝術形式,或被認為具有藝術或智力價值的任何單一作品,通常是由於以不同於普通用途的方式部署語言。它的拉丁詞根\"literatura\"/\"litteratura\"(本身起源於\"littera\":\"letter\"或\"handwriting\")被用來指代所有的書面記錄,儘管當代定義將術語擴展到包括口頭或唱歌的文本(口頭文學)。文學可以根據是虛構作品還是非虛構作品進行分類,也可以根據是韻文還是散文進行分類;可以根據長篇小說、中篇小說、短篇小說等主要形式進一步區分;作品往往根據歷史時期或者遵守某些美學特徵或期望(藝術類型)進行分類。以语言文字为工具形象化地反映現實的藝術,包括韻文、散文、劇本、小说等,是文化的重要表现形式,以不同的流派表现内心情感和再现一定时期和一定地域的生活。\\n\\n這個概念隨著時間的推移而改變了意義:現在它可以擴大到非書面的口頭藝術形式,可以與語言或文字本身配合,因此很難就其起源達成一致。印刷技術的發展使得書面作品的分佈和擴散成為可能,最終導致了網絡文學。\\n\\n文學並不一定是客觀的,一名成功的文學家能在自己的文學作品中,展現自己對於文學的主觀看法,抒發自己的情緒和感觸,但藉由嘗試建立一個「客觀的標準」,有時對能幫助作家了解「讀者的感受」以求將內心之情感與藝術表現完整的體現在讀者心中。有時也能藉作家主觀想法帶給社會不同面相去省思現況,例如女性文學的興起。\\n\\n文學的歷史和文明發展有密切的關係。若將文學定義為用文字記錄的作品,最早的古代文學作品一般認為是古埃及文學及。古埃及文學中主要的文類(讚美詩、祈禱文及故事)幾乎都是以詩的方式寫成的,不過雖然可以清楚看出有使用詩歌技巧(poetic devices),但詩歌的韻律不明。最早已知的文學作品是公元前2700年一篇由蘇美人创作的《吉爾伽美什史詩》,當中描述英雄主義、友誼、損失及追逐永生。\\n\\n不同的歷史時期有著不同特色的文學。古代的文學中有許多有關世界起源及習俗起源內容,也有一些其中有道德及靈性意涵的神話。铁器时代的荷马史詩及以較晚一些的有較多有關作者的資訊,而許多的神話則是用口頭傳播的方式流傳下來。\\n\\n各種文學都可以視為是文字的紀錄,文學本身可能是寫實或是虛構,但都可以描繪出一些事實,例如主角的動作及言語、作者的寫作風格,以及文字後的含義等。這些情節不只是娛樂性的,其中也包括了經濟、心理、科學、宗教、政治、文化及社學的相關資訊。在學習歷史時,研究及分析當時的文學也是重要的一部份。研究過去的文學可以看到不同歷史時期時,其社會和社會規範的演變,甚至於也可助於瞭解現今的文學,因為其中常常引用古希臘神話、宗教典藉及相關文獻的資料。人們不止可以從各主題相關的文學中看到該主題隨著歷史的演進(例如從經濟史的書或介紹科學及演化的書),甚至連科幻小說中都可看到類似的內容。作者常常在其作品中加入一些歷史的內容,例如拜倫勳爵在《Childe Harold’s Pilgrimage: Canto I》中藉由主角Childe 
Harold提到西班牙文及法文,也提到作者的一些想法。藉由文學人們可以繼續的發現有關歷史的新資訊,這個從各個學科領域都有和文學相關的子領域可以看出。當人們將資訊用文字的方式紀錄下來,就比較容易從這一代流傳到下一代,留下來的資訊會越來越多。從這些資料,人們可以研究文學、提昇想法、擴展知識、也可以開始像醫學或是貿易等專業領域的研究。而隨著現代人們學習內容的增加及拓展,文學也會有一些不同,成為以後人們研究的基礎。\\n\\n許多古文明都有其對哲學或是相關觀點的文學,像是古中國、古印度、波斯時及希臘羅馬古典时代的作品。許多古代的作品,就算是敘事的形式,都還是有道德或是教誨上的目的,像梵語的《五卷书》或是奥维德的《变形记》,後來戏剧及讽刺作品的受眾也變多,因此也開始有類似性質的文學創作。抒情诗常常是貴族圈的特性產物,特別在東亞,許多歌曲被貴族收集,成為詩歌。\\n\\n浪漫主義的異常特質在中世紀綻放。同時,理性時代造就了民族主義史詩與哲學短文。浪漫主義強調通俗的文學及情感的投入,慢慢被尋求真實的現實主義與自然主義文學所取代。到了20世紀,象徵主義抬頭,探索角色的描述和發展。\\n\\n在很长一段时间,中国的文学与史学和神话并无明显的界限,最早的文学是对历史和神话的记录。但纯粹的文学早在周时就已出现,例如《诗經》。中國古代的文學主要著重在哲學、史学史、军事学、農業及韻文。中國發明了造紙術及雕版印刷,也是世界上第一個。中國的許多哲學思想是起源自春秋戰國時的诸子百家,其中最重要的有儒家、道家、墨家及法家,而軍事學書籍(如孙子兵法)也是在春秋戰國時開始出現。中国历史文學則從尚書、春秋、戰國策、史記等一直延續下來,而且有很詳細的資料記錄。\\n\\n中国的文学成就最大的是诗歌,从《离骚》到唐代律诗,诗歌一直对中国文坛有着巨大的影响。后来诗、词、曲、小说等文学形式分别在唐、宋、元、明清达到高峰。民國時期由胡適和陳獨秀推動的新文學運動,認為作品不應只講求形式,應注重內容的充實、表達及情感,也推動白話文學。民国时期,武侠小说风靡海内外,成为当时最受欢迎的通俗小说。\\n\\n中华人民共和国时期,在文化大革命後,出現相關的反思文學及伤痕文学,也有一批白话文诗人进行大量创作,也取代古诗成为当时最欢迎的诗歌作品。後來網路文學興起,成為受歡迎的商業作品。\\n\\n中華民國在撤退台灣後,在50及60年代出現了以四大抗戰小說為代表的戰鬥文藝小說,都是以抗戰時期為背景,後來又有反共文學的出現,而60年代開始,以瓊瑤為代表的言情小說也開始行。70年代起逐漸開始有對於台灣社會研究的新現代文學,以及強調鄉土的鄉土寫實文學,1990年後也開始了網路文學的興起。\\n\\n\\n中国古典文學分为诗和文,文又分为韵文和散文,中国的抒情诗和文言文最早而比较发达。\\n\\n文學一般分为小說、散文、詩歌、劇本,并称为四大文学体裁;\\n\\n\\n\\n劇本是另一种古老的文学形式,主要通过不同角色之间的对话来表达作者的思想和感情。劇本可以用于舞台的表演,也可以阅读。像元曲、京剧、昆剧都屬於這個部份。西方的戲劇許多都伴隨著音樂和舞蹈,例如歌劇及音樂劇,古希腊戏剧是目前已知最早期的西方戲劇,有悲劇、喜劇、悲喜劇等。\\n\\n\\n\\n有許多的文學獎,頒發給優秀的作家,表揚其文學的成就。因為文學的範圍很廣,許多文學獎項會依風格、文學類型、語言、國籍及其他特性(例如新進作家或是等)再做分類。\\n\\n諾貝爾文學獎是依諾貝爾在1895年的遺囑所成立的獎項,是諾貝爾獎中的一項,一般是因為作者的整體作品而獲獎,而非著重特定的作品。其他不分國籍的獎項有:纽斯塔特国际文学奖、布克國際獎及卡夫卡獎。\\n\\n\\n是文學創作者應用在文學中,製造特別效果的方式。文學技巧的範圍很廣,包括作品是否要用第一人稱或是其他人稱、用傳統的線性敘事或是、或是文類選擇都包括在內。這可以讓讀者感受到一些熟悉的結構及架構,例如傳統犯罪小說,不過有些作者會特別選擇一些文學技巧來讓讀者有意外的感受。\\n\\n文學技巧的使用也可能會產生新的文類,就像塞繆爾·理查森寫的早期現代小說《》一様。《Pamela》是用許多的信件組成,稱為「書信體技巧」(epistolary technique)。因此《Pamela》讓大家再次注意到,一個以往曾出現,但沒有這麼受注意的文類。\\n\\n文學技巧和文学手段(literary 
device)不同,有點類似軍事戰略和軍事戰術之間的關係。文学手段是在敘述中用的特殊結構,像是隐喻、明喻、省略、敘事及託寓等,甚至單純的諧音都可以作為文学手段。也可以視為是文学手段,例如意識流敘事。\\n\\n文學批評是指文學批評者對其他人作品的評論和評估,有時也會用來改進及提昇文學作品。也可以對作者帶來類似的作用。有許多不同種類的文學批評,背後會有其理論基礎,不同種類的文學批評可以評論文學作品的各個部份或是各個層面。\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n历史\\n\\n歷史(现代汉语词汇,古典文言文称之为史),指人类社会过去的事件和行动,以及对这些事件行为有系统的记录、诠释和研究。歷史可提供今人理解過去,作為未來行事的參考依據,与伦理、哲学和艺术同属人类精神文明的重要成果。历史的第二个含'" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Machine Learning Quick Review" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10k_articles.txt lesson01-Part01.pdf\r\n", + "80k_articles.txt lesson01-Part02.pdf\r\n", + "Lecture-01-An Introduction to AI.ipynb lesson01.pdf\r\n", + "Untitled.ipynb regression_example.py\r\n", + "Untitled1.ipynb sqlResult_1558435.csv\r\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [], + "source": [ + "titanic_content = pd.read_csv(open('../../datasource/titanic_train.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NaNS
91012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NaNC
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "8 9 1 3 \n", + "9 10 1 2 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54.0 0 \n", + "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", + "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", + "9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S \n", + "8 2 347742 11.1333 NaN S \n", + "9 0 237736 30.0708 NaN C " + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_content[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "titanic_content = pd.read_csv(open('../../datasource/titanic_train.csv'))\n", + "titanic_content = titanic_content.dropna()\n", + "age_with_fare = titanic_content[['Age', 'Fare']]\n", + "age_with_fare = age_with_fare[ (age_with_fare['Age'] > 22) & (age_with_fare['Fare'] < 400) & (age_with_fare['Fare'] > 130)]\n", + "age = age_with_fare['Age']\n", + "fare = age_with_fare['Fare']" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": 
"code", + "execution_count": 202, + "metadata": {}, + "outputs": [], + "source": [ + "L1 = np.array([1,2, 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [], + "source": [ + "L2 = np.mean(np.array([2, 3, 4]))" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "L2" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 1])" + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.abs((L2 - L1)) ** 2\n", + "\n", + "def loss(y_true, yhats): return np.mean(np.abs(y_true - yhats))" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [], + "source": [ + "## broadcast" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 198, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFCVJREFUeJzt3X+MHOd93/H3pxTtXuOkZ4enVPwVyoHMWv4RUT2rbIWmttyEkmuYhJEAChpbUI0QDdhUCmy6pg3ESAHBbhjYiRBEgBCptlDVrmrTtJAoZRRHreGiknoUJVESw5qtHYtHOqTh0nari0zR3/6xc9LqdOTu3q+9G75fwOFmn5lZfG9u97Ozzzy7T6oKSVJ7/Y1hFyBJWlwGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktZxBL0ktZ9BLUssZ9JLUcpcMuwCANWvW1KZNm4ZdhiStKAcPHvxOVY312m5ZBP2mTZuYmJgYdhmStKIk+ct+trPrRpJazqCXpJYz6CWp5Qx6SWo5g16SWm5ZjLqZi/2HJtl74CgnzkyxdnSE3ds2s2PLur7XS9IwLWVGrcig339okj37DjN19hwAk2em2LPvMAA7tqzruV6ShmmpM2pFdt3sPXD0xQM0bersOfYeONrXekkapqXOqBUZ9CfOTF2wvdd6SRqmpc6onkGfZEOSh5IcSfJ0klu61v16kqNN+293te9JcqxZt22hi147OnLB9l7rJWmYljqj+jmjfwH4YFW9EdgK7EpyZZJ3ANuBt1bVm4DfAUhyJXAj8CbgeuAPkqxayKJ3b9vMyOqX3+XI6lXs3ra5r/WSNExLnVE9L8ZW1UngZLP8gyRHgHXArwKfrKrnm3Wnml22A59v2r+R5BhwDfDfF6ro6YsV57ti3Wu9JA3TUmdUqqr/jZNNwFeBNze/v0znrP2vgQ9V1f9I8vvAw1X175t97gL+pKq+cL77HR8fL7/UTJIGk+RgVY332q7v4ZVJXgN8Ebi1qr6f5BLgtXS6c94G3Jfk9UBm2f0VryZJdgI7ATZu3NhvGZKkAfU16ibJajohf29V7WuajwP7quNR4EfAmqZ9Q9fu64ETM++zqu6sqvGqGh8b6/l1ypKkOepn1E2Au4AjVfWprlX7geuabd4AvAr4DnA/cGOSVye5HLgCeHShC5ck9aefrptrgfcBh5M83rR9FLgbuDvJU8APgZuq0+H/dJL7gGfojNjZVVXnZrlfSdIS6GfUzdeYvd8d4FfOs89twG3zqEuStEBW5CdjJUn9M+glqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWq5vr+PfqXZf2jSGaZ0UfM5oGmtDPr9hybZs+8wU2c7X5o5eWaKPfsOA/hA10XB54C6tbLrZu+Boy8+wKdNnT3H3gNHh1SRtLR8DqhbK4P+xJmpgdqltvE5oG6tDPq1oyMDtUtt43NA3VoZ9Lu3bWZk9aqXtY2sXsXubZuHVJG0tHwOqFsrL8ZOX2xyxIEuVj4H1C2daV6Ha3x8vCYmJoZdhiStKEkOVtV4r+1a2XUjSXqJQS9JLWfQS1LL9Qz6JBuSPJTkSJKnk9wyY/2HklSSNc3tJLk9ybEkTya5erGKlyT11s+omxeAD1bVY0l+HDiY5MGqeibJBuDngW91bX8DcEXz8/eBO5rfkqQh6HlGX1Unq+qxZvkHwBFgeozWp4EPA91Dd7YD91THw8BokssWtmxJUr8G6qNPsgnYAjyS5D3AZFU9MWOzdcCzXbeP89ILgyRpifX9gakkrwG+CNxKpzvnY8AvzLbpLG2vGKyfZCewE2Djxo39liFJGlBfZ/RJVtMJ+Xurah/wM8DlwBNJvgmsBx5L8nfonMFv6Np9PXBi5n1W1Z1VNV5V42NjY/P7KyRJ59XPqJsAdwFHqupTAFV1uKourapNVbWJTrhfXVXfBu4H3t+MvtkKfK+qTi7
enyBJupB+um6uBd4HHE7yeNP20ap64DzbPwC8CzgGPAfcPO8qJUlz1jPoq+przN7v3r3Npq7lAnbNuzJJ0oLwk7GS1HIGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktZxBL0ktZ9BLUssZ9JLUcga9JLWcQS9JLdf399FLC2n/oUn2HjjKiTNTrB0dYfe2zezY4vw00mIw6LXk9h+aZM++w0ydPQfA5Jkp9uw7DGDYS4vArhstub0Hjr4Y8tOmzp5j74GjQ6pIajeDXkvuxJmpgdolzY9BryW3dnRkoHZJ82PQa8nt3raZkdWrXtY2snoVu7dtHlJFUrt5MVZLbvqCq6NupKVh0GsodmxZZ7BLS8SuG0lqOYNeklrOoJeklusZ9Ek2JHkoyZEkTye5pWnfm+QvkjyZ5EtJRrv22ZPkWJKjSbYt5h8gSbqwfs7oXwA+WFVvBLYCu5JcCTwIvLmq3gr8T2APQLPuRuBNwPXAHyRZNes9S5IWXc+gr6qTVfVYs/wD4Aiwrqr+tKpeaDZ7GFjfLG8HPl9Vz1fVN4BjwDULX7okqR8D9dEn2QRsAR6ZseqfA3/SLK8Dnu1ad7xpm3lfO5NMJJk4ffr0IGVIkgbQd9AneQ3wReDWqvp+V/vH6HTv3DvdNMvu9YqGqjuraryqxsfGxgarWpLUt74+MJVkNZ2Qv7eq9nW13wS8G3hnVU2H+XFgQ9fu64ETC1OuJGlQ/Yy6CXAXcKSqPtXVfj3wr4H3VNVzXbvcD9yY5NVJLgeuAB5d2LIlSf3q54z+WuB9wOEkjzdtHwVuB14NPNh5LeDhqvoXVfV0kvuAZ+h06eyqqnOz3K8kaQn0DPqq+hqz97s/cIF9bgNum0ddkqQF4idjJanlDHpJajmDXpJazqCXpJZz4pEB7T806cxIklYUg34A+w9NsmffYabOdkaLTp6ZYs++wwB9hb0vEsub/x+1lV03A9h74OiLIT9t6uw59h442nPf6ReJyTNTFC+9SOw/NLlI1WoQ/n/UZgb9AE6cmRqovdt8XiS0+Pz/qM0M+gGsHR0ZqL3bfF4ktPj8/6jNDPoB7N62mZHVL59DZWT1KnZv29xz3/m8SGjx+f9Rmxn0A9ixZR2feO9bWDc6QoB1oyN84r1v6euC3XxeJLT4/P+ozRx1M6AdW9bNaSTG9D6O6lie/P+ozfLS18gPz/j4eE1MTAy7DElaUZIcrKrxXtvZdSNJLWfQS1LLGfSS1HIGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktVzPoE+yIclDSY4keTrJLU3765I8mOTrze/XNu1JcnuSY0meTHL1Yv8RkqTz6+eM/gXgg1X1RmArsCvJlcBHgK9U1RXAV5rbADcAVzQ/O4E7FrxqSVLfegZ9VZ2sqsea5R8AR4B1wHbgs81mnwV2NMvbgXuq42FgNMllC165JKkvA/XRJ9kEbAEeAX6qqk5C58UAuLTZbB3wbNdux5s2SdIQ9B30SV4DfBG4taq+f6FNZ2l7xTenJdmZZCLJxOnTp/stQ5I0oL6CPslqOiF/b1Xta5r/arpLpvl9qmk/Dmzo2n09cGLmfVbVnVU1XlXjY2Njc61fktRDP6NuAtwFHKmqT3Wtuh+4qVm+CfhyV/v7m9E3W4HvTXfxSJKWXj8Tj1wLvA84nOTxpu2jwCeB+5J8APgW8EvNugeAdwHHgOeAmxe0YknSQHoGfVV9jdn73QHeOcv2BeyaZ12SpAXiJ2MlqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWo5g16SWs6gl6SWM+glqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWq5nkGf5O4kp5I81dV2VZKHkzyeZCLJNU17ktye5FiSJ5NcvZjFS5J66+eM/jPA9TPafhv4raq6CvjN5jbADcAVzc9O4I6FKVOSNFc9g76qvgp8d2Yz8BPN8t8GTjT
L24F7quNhYDTJZQtVrCRpcJfMcb9bgQNJfofOi8U/bNrXAc92bXe8aTs55wolSfMy14uxvwb8RlVtAH4DuKtpzyzb1mx3kGRn078/cfr06TmWIUnqZa5BfxOwr1n+T8A1zfJxYEPXdut5qVvnZarqzqoar6rxsbGxOZYhSeplrkF/AvjHzfJ1wNeb5fuB9zejb7YC36squ20kaYh69tEn+RzwdmBNkuPAx4FfBX4vySXAX9MZYQPwAPAu4BjwHHDzItQsSRpAz6Cvql8+z6q/N8u2Beyab1GSpIXjJ2MlqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWo5g16SWs6gl6SWM+glqeUMeklqOYNeklpurpODa4ntPzTJ3gNHOXFmirWjI+zetpkdW9YNuyxJK4BBvwLsPzTJnn2HmTp7DoDJM1Ps2XcYwLCX1JNdNyvA3gNHXwz5aVNnz7H3wNEhVSRpJTHoV4ATZ6YGapekbgb9CrB2dGSgdknqZtCvALu3bWZk9aqXtY2sXsXubZuHVJGklcSLsSvA9AVXR91ImoueQZ/kbuDdwKmqenNX+68D/xJ4Afjjqvpw074H+ABwDvhXVXVgMQq/2OzYss5glzQn/ZzRfwb4feCe6YYk7wC2A2+tqueTXNq0XwncCLwJWAv8WZI3VNW5V9yrJGlJ9Oyjr6qvAt+d0fxrwCer6vlmm1NN+3bg81X1fFV9AzgGXLOA9UqSBjTXi7FvAP5RkkeS/Nckb2va1wHPdm13vGmTJA3JXC/GXgK8FtgKvA24L8nrgcyybc12B0l2AjsBNm7cOMcyJEm9zPWM/jiwrzoeBX4ErGnaN3Rttx44MdsdVNWdVTVeVeNjY2NzLEOS1Mtcg34/cB1AkjcArwK+A9wP3Jjk1UkuB64AHl2IQiVJc9PP8MrPAW8H1iQ5DnwcuBu4O8lTwA+Bm6qqgKeT3Ac8Q2fY5S5H3EjScKWTz8M1Pj5eExMTwy5DklaUJAerarzXdn4FgiS1nEEvSS3nd91oKOYzY5azbWkmHxMXZtBryc1nxixn29JMPiZ6s+tGS24+M2Y525Zm8jHR20V5Ru/bvOGaz4xZzralmXxM9HbRndFPv82bPDNF8dLbvP2HJodd2kVjPjNmOduWZvIx0dtFF/S+zRu++cyY5WxbmsnHRG8XXdeNb/OGbz4zZjnblmbyMdHbRffJ2Gs/+edMzhLq60ZH+G8fuW5JapCkheAnY8/Dt3mSLjYXXdeNb/MkXWwuuqAHJ9qW2sYh0xd2UQa9pPbwk7G9XXR99JLaxSHTvRn0klY0h0z3ZtBLWtH8ZGxvBr2kFc0h0715MVbSiuaQ6d4MekkrnkOmL8yuG0lqOYNeklquZ9AnuTvJqSRPzbLuQ0kqyZrmdpLcnuRYkieTXL0YRUuS+tfPGf1ngOtnNibZAPw88K2u5huAK5qfncAd8y9RkjQfPYO+qr4KfHeWVZ8GPgx0f8/xduCe6ngYGE1y2YJUKkmakzn10Sd5DzBZVU/MWLUOeLbr9vGmbbb72JlkIsnE6dOn51KGJKkPAwd9kr8FfAz4zdlWz9I268wmVXVnVY1X1fjY2NigZUiS+jSXcfQ/A1wOPJEEYD3wWJJr6JzBb+jadj1wYr5FSpLmbuAz+qo6XFWXVtWmqtpEJ9yvrqpvA/cD729G32wFvldVJxe2ZEnSIHqe0Sf5HPB2YE2S48DHq+qu82z+APAu4BjwHHDzAtWpIbnQhA5O9iCtDD2Dvqp+ucf6TV3LBeyaf1laDi40oQPgZA/SCuF33ei8ek3ocL51ix30vsuQBmPQ67zmMqHDYk/24LsMaXAGvc5r7egIk7ME9/SEDhdat1iW67sMaTnzS810Xhea0GFYkz1c6F2GU8pJs/OMXufVz4QOS90fvhzfZUjLXToDZYZrfHy8JiYmhl2GVoCZffTQeSfxife+BeC86+y6URslOVhV472284x
eK8pyfJchLXee0UvSCtXvGb0XYyWp5Qx6SWo5g16SWs6gl6SWM+glqeWWxaibJKeBvxx2HcvEGuA7wy5ihfBY9cfj1J+VeJx+uqp6TtG3LIJeL0ky0c9wKXms+uVx6k+bj5NdN5LUcga9JLWcQb/83DnsAlYQj1V/PE79ae1xso9eklrOM3pJajmDfoiS/M0kjyZ5IsnTSX6rab88ySNJvp7kPyZ51bBrXQ6SrEpyKMkfNbc9TjMk+WaSw0keTzLRtL0uyYPNcXowyWuHXeewJRlN8oUkf5HkSJJ/0ObjZNAP1/PAdVX1s8BVwPVJtgL/Fvh0VV0B/B/gA0OscTm5BTjSddvjNLt3VNVVXUMFPwJ8pTlOX2luX+x+D/jPVfV3gZ+l87hq7XEy6IeoOv5vc3N181PAdcAXmvbPAjuGUN6ykmQ98E+BP2xuB49Tv7bTOT7gcSLJTwA/B9wFUFU/rKoztPg4GfRD1nRHPA6cAh4E/hdwpqpeaDY5DjhzBvwu8GHgR83tn8TjNJsC/jTJwSQ7m7afqqqTAM3vS4dW3fLweuA08O+arsA/TPJjtPg4GfRDVlXnquoqYD1wDfDG2TZb2qqWlyTvBk5V1cHu5lk2vaiPU+PaqroauAHYleTnhl3QMnQJcDVwR1VtAf4fLeqmmY1Bv0w0bx3/C7AVGE0yPc3jeuDEsOpaJq4F3pPkm8Dn6XTZ/C4ep1eoqhPN71PAl+icPPxVkssAmt+nhlfhsnAcOF5VjzS3v0An+Ft7nAz6IUoylmS0WR4B/gmdi0IPAb/YbHYT8OXhVLg8VNWeqlpfVZuAG4E/r6p/hsfpZZL8WJIfn14GfgF4CrifzvEBjxNV9W3g2SSbm6Z3As/Q4uPkB6aGKMlb6Vz0WUXnRfe+qvo3SV5P58z1dcAh4Feq6vnhVbp8JHk78KGqerfH6eWa4/Gl5uYlwH+oqtuS/CRwH7AR+BbwS1X13SGVuSwkuYrOhf1XAf8buJnmOUgLj5NBL0ktZ9eNJLWcQS9JLWfQS1LLGfSS1HIGvSS1nEEvSS1n0EtSyxn0ktRy/x8ZuDYH17yb9wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(age, fare)" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb new file mode 100644 index 0000000..0155d67 --- /dev/null +++ b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter\n", + "from collections import defaultdict\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# wikipedia, smoothing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Processing Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 运行 python WikiExtractor.py -b 2000M zhwiki-20181101-pages-articles.xml.bz2\n", + "# 没有做繁体转简体处理,不会安装opencc的windows版本 :-(" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "all_wiki_content = open('D://pyproject//git//AI-NLP//data//text//AA//wiki_00',encoding='UTF-8').read()\n", + "all_wiki_content = re.sub(r'<[^>]+>','',all_wiki_content) # 去掉 tag" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize(string): \n", 
+ " #return ''.join(re.findall('[\\w|\\d]+', string)) # 不太明白两种写法的区别,w也能匹配数字,但输出结果是有不同\n", + " return ''.join(re.findall('\\w+', string))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "380434793" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_character = tokenize(all_wiki_content)\n", + "del all_wiki_content # 释放内存\n", + "len(all_character)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('的', 9938192),\n", + " ('1', 5740539),\n", + " ('0', 4559519),\n", + " ('年', 4088849),\n", + " ('2', 3705103),\n", + " ('一', 3174566),\n", + " ('在', 3142422),\n", + " ('是', 2800422),\n", + " ('中', 2763222),\n", + " ('9', 2730241),\n", + " ('人', 2610319),\n", + " ('大', 2095073),\n", + " ('有', 2064509),\n", + " ('e', 1885083),\n", + " ('a', 1789303),\n", + " ('3', 1753587),\n", + " ('5', 1721315),\n", + " ('和', 1705550),\n", + " ('為', 1662714),\n", + " ('8', 1646008)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_character_counts = Counter(all_character)\n", + "all_character_counts.most_common()[0:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('20', 1579014),\n", + " ('19', 1442094),\n", + " ('00', 1225241),\n", + " ('01', 853922),\n", + " ('10', 547006),\n", + " ('年1', 527492),\n", + " ('的一', 469028),\n", + " ('12', 444080),\n", + " ('11', 419457),\n", + " ('0年', 417267),\n", + " ('一个', 400248),\n", + " ('18', 387729),\n", + " ('人口', 349391),\n", + " ('99', 340092),\n", + " ('中国', 328509),\n", + " ('1年', 322136),\n", + " ('公里', 320126),\n", + " ('5年', 318534),\n", + " ('月1', 318147),\n", + " ('er', 316517)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "gram_length = 2\n", + "two_gram_counts = {}\n", + "for i in range(len(all_character)-gram_length): # 用 for循环省内存\n", + " k = all_character[i:i+gram_length]\n", + " two_gram_counts[k] = (two_gram_counts[k]+ 1) if (k in two_gram_counts.keys()) else 1\n", + " \n", + "two_gram_counts = Counter(two_gram_counts)\n", + "two_gram_counts.most_common()[0:20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unigram, Good-Turing smoothing" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21491" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_character_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def get_char_prob_from_counts(counts,k=5): \n", + " # Good-Turing smoothing\n", + " n = sum(counts.values())\n", + " # 计算nr\n", + " nr = defaultdict(int)\n", + " for i in counts.values():\n", + " nr[i] += 1\n", + " nr[0] = 90000 - len(counts) # 假设汉字共9万个\n", + " # 计算rstar\n", + " rstar = [0]*(k+1) \n", + " total_decreased = 0\n", + " for r in range(1,k+1,1):\n", + " rstar[r] = (r+1)*nr[r+1]/nr[r]\n", + " total_decreased += (r*nr[r] - rstar[r]*nr[r])\n", + " #print(r,rstar[r])\n", + " rstar[0] = total_decreased / nr[0]\n", + " #print (0,rstar[0])\n", + " def get_prob(char):\n", + " occurence = counts.get(char,0)\n", + " return rstar[occurence]/n if occurence<=k else occurence/n\n", + " return get_prob\n", + "\n", + "get_char_prob = get_char_prob_from_counts(all_character_counts,k=5)\n", + "\n", + "from functools import reduce\n", + "from operator import mul\n", + "def get_1_gram_string_prob(string):\n", + " return reduce(mul,[get_char_prob(char) for char in string])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pair = \"\"\"前天晚上吃晚饭的时候\n", + 
"前天晚上吃早饭的时候\"\"\".split('\\n')\n", + "\n", + "pair2 = \"\"\"正是一个好看的小猫\n", + "真是一个好看的小猫\"\"\".split('\\n')\n", + "\n", + "pair3 = \"\"\"我无言以对,简直\n", + "我简直无言以对\"\"\".split('\\n')\n", + "\n", + "pairs = [pair, pair2, pair3]\n", + "def get_probability_prefromance(language_model_func, pairs):\n", + " for (p1, p2) in pairs:\n", + " print('*'*18)\n", + " print('\\t\\t {} with probability {}'.format(p1, language_model_func(tokenize(p1)))) # tokenize去掉','这样的标点\n", + " print('\\t\\t {} with probability {}'.format(p2, language_model_func(tokenize(p2))))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 2.3223066267509665e-33\n", + "\t\t 前天晚上吃早饭的时候 with probability 4.678562566970852e-33\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 1.1087006396816684e-26\n", + "\t\t 真是一个好看的小猫 with probability 3.4663369707956e-27\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 1.747335364002409e-23\n", + "\t\t 我简直无言以对 with probability 1.747335364002409e-23\n" + ] + } + ], + "source": [ + "get_probability_prefromance(get_1_gram_string_prob, pairs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2-gram, Katz back-off smoothing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "two_gram_table ={}\n", + "for w in two_gram_counts.keys():\n", + " if w[0] not in two_gram_table.keys():\n", + " two_gram_table[w[0]] = {}\n", + " two_gram_table[w[0]][w[1]] = two_gram_counts[w]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def get_2_gram_prob_from_counts(counts,k=5): \n", + " # Katz smoothing\n", + " n = sum(counts.values())\n", + " # 计算nr\n", + " nr = defaultdict(int)\n", + " for i in counts.values():\n", + " nr[i] += 1\n", + "\n", + " # 计算dr, 
for 1<= r<=k\n", + " dr = [1]*(k+1) # dr[0] 不使用\n", + " tmp = (k+1)*nr[k+1]/nr[1]\n", + " for r in range(1,k+1,1):\n", + " rstar = (r+1)*nr[r+1]/nr[r]\n", + " dr[r] = (rstar/r-tmp)/(1-tmp)\n", + "\n", + " # 计算 two_gram_table,最终存储了所有pair的条件概率\n", + " # 计算 a,回退到unigram的系数\n", + " two_gram_table ={}\n", + " a = {}\n", + " for w in counts.keys():\n", + " if w[0] not in two_gram_table.keys():\n", + " two_gram_table[w[0]] = {}\n", + " two_gram_table[w[0]][w[1]] = counts[w]\n", + " for w0 in two_gram_table.keys():\n", + " n0 = sum(two_gram_table[w0].values())\n", + " for w1 in two_gram_table[w0].keys():\n", + " c = two_gram_table[w0][w1]\n", + " if c > k:\n", + " two_gram_table[w0][w1] = c/n0\n", + " else:\n", + " two_gram_table[w0][w1] = dr[c]*c/n0\n", + " sumkatz = sum(two_gram_table[w0].values())\n", + " sumSeenUnigram = sum(get_char_prob(e) for e in two_gram_table[w0].keys())\n", + " a[w0] = (1-sumkatz)/(1-sumSeenUnigram)\n", + " \n", + " \n", + " def get_prob(word,prev):\n", + " occurence = counts.get(prev+word,0)\n", + " if occurence > 0:\n", + " return two_gram_table[prev][word]\n", + " elif prev == '':\n", + " return get_char_prob(word)\n", + " else:\n", + " return a[prev]*get_char_prob(word) \n", + " return get_prob\n", + "\n", + "get_2_gram_prob = get_2_gram_prob_from_counts(two_gram_counts,k=5)\n", + "\n", + "def get_2_gram_string_prob(string):\n", + " probList = []\n", + " for i,c in enumerate(string):\n", + " prev = '' if i == 0 else string[i-1]\n", + " probList.append(get_2_gram_prob(c,prev))\n", + " return reduce(mul,probList)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************\n", + "\t\t 前天晚上吃晚饭的时候 with probability 2.8863673714341063e-24\n", + "\t\t 前天晚上吃早饭的时候 with probability 6.446398053347142e-25\n", + "******************\n", + "\t\t 正是一个好看的小猫 with probability 6.4603668266917246e-21\n", + "\t\t 真是一个好看的小猫 with probability 
9.252304558785208e-22\n", + "******************\n", + "\t\t 我无言以对,简直 with probability 9.167158643679254e-21\n", + "\t\t 我简直无言以对 with probability 7.569514628385958e-22\n" + ] + } + ], + "source": [ + "get_probability_prefromance(get_2_gram_string_prob, pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From eef912cd634ddcd19f3e98be9851e99af141eee0 Mon Sep 17 00:00:00 2001 From: LIWEN CHENG Date: Fri, 30 Nov 2018 13:21:35 +0800 Subject: [PATCH 2/2] lecture2 --- ...Lecture-2-Language-Model-ustccheng02.ipynb | 2609 ----------------- ...re-2-Wikipedia-Smoothing-ustccheng02.ipynb | 65 +- 2 files changed, 25 insertions(+), 2649 deletions(-) diff --git a/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb b/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb index b873719..d513d97 100644 --- a/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb +++ b/2018-autumn/Lecture-2-Language-Model-ustccheng02.ipynb @@ -2531,2615 
+2531,6 @@ "source": [ "get_probability_prefromance(get_2_gram_string_prob, pairs)" ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.057921115909864e-15" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.6195189821101717e-12" - ] - }, - "execution_count": 128, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "380434793" - ] - }, - "execution_count": 153, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 155, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('的', 9938192),\n", - " ('1', 5740539),\n", - " ('0', 4559519),\n", - " ('年', 4088849),\n", - " ('2', 3705103),\n", - " ('一', 3174566),\n", - " ('在', 3142422),\n", - " ('是', 2800422),\n", - " ('中', 2763222),\n", - " ('9', 2730241),\n", - " ('人', 2610319),\n", - " ('大', 2095073),\n", - " ('有', 2064509),\n", - " ('e', 1885083),\n", - " ('a', 1789303),\n", - " 
('3', 1753587),\n", - " ('5', 1721315),\n", - " ('和', 1705550),\n", - " ('為', 1662714),\n", - " ('8', 1646008),\n", - " ('以', 1611294),\n", - " ('4', 1579101),\n", - " ('月', 1527145),\n", - " ('日', 1511531),\n", - " ('6', 1501304),\n", - " ('o', 1452120),\n", - " ('7', 1440467),\n", - " ('了', 1434073),\n", - " ('国', 1417423),\n", - " ('i', 1396361),\n", - " ('为', 1360856),\n", - " ('n', 1348198),\n", - " ('r', 1329195),\n", - " ('地', 1309342),\n", - " ('於', 1292718),\n", - " ('國', 1290442),\n", - " ('成', 1267631),\n", - " ('公', 1228434),\n", - " ('上', 1220226),\n", - " ('不', 1198193),\n", - " ('出', 1152193),\n", - " ('行', 1134022),\n", - " ('他', 1128358),\n", - " ('部', 1117122),\n", - " ('斯', 1107951),\n", - " ('t', 1088959),\n", - " ('之', 1060971),\n", - " ('名', 1027713),\n", - " ('其', 994654),\n", - " ('主', 993612),\n", - " ('作', 977896),\n", - " ('生', 967962),\n", - " ('l', 964757),\n", - " ('分', 953857),\n", - " ('方', 948753),\n", - " ('于', 929041),\n", - " ('s', 920003),\n", - " ('下', 915048),\n", - " ('西', 912137),\n", - " ('用', 904531),\n", - " ('到', 904395),\n", - " ('第', 898354),\n", - " ('家', 865004),\n", - " ('位', 862875),\n", - " ('而', 862821),\n", - " ('及', 857057),\n", - " ('由', 856546),\n", - " ('後', 834189),\n", - " ('時', 830986),\n", - " ('被', 817636),\n", - " ('本', 810581),\n", - " ('民', 808941),\n", - " ('法', 804770),\n", - " ('南', 804650),\n", - " ('学', 798939),\n", - " ('市', 791625),\n", - " ('里', 778211),\n", - " ('會', 777237),\n", - " ('同', 771726),\n", - " ('个', 763306),\n", - " ('多', 761161),\n", - " ('可', 760766),\n", - " ('政', 753960),\n", - " ('子', 747029),\n", - " ('自', 721717),\n", - " ('任', 719743),\n", - " ('特', 708178),\n", - " ('北', 705727),\n", - " ('文', 703854),\n", - " ('高', 684065),\n", - " ('加', 682447),\n", - " ('德', 679369),\n", - " ('海', 676572),\n", - " ('最', 676398),\n", - " ('新', 673906),\n", - " ('與', 670787),\n", - " ('克', 665932),\n", - " ('u', 665502),\n", - " ('后', 658911),\n", - " ('也', 657383),\n", - " ('代', 
657013),\n", - " ('前', 656735),\n", - " ('等', 654057),\n", - " ('利', 651952),\n", - " ('平', 650907),\n", - " ('所', 646943),\n", - " ('學', 637814),\n", - " ('三', 637252),\n", - " ('h', 633489),\n", - " ('道', 627641),\n", - " ('度', 625007),\n", - " ('個', 624395),\n", - " ('因', 622385),\n", - " ('此', 620558),\n", - " ('面', 620391),\n", - " ('建', 618519),\n", - " ('得', 617685),\n", - " ('美', 610494),\n", - " ('物', 610144),\n", - " ('要', 602652),\n", - " ('教', 587490),\n", - " ('山', 585261),\n", - " ('时', 578547),\n", - " ('至', 575987),\n", - " ('事', 574421),\n", - " ('小', 573926),\n", - " ('路', 571932),\n", - " ('立', 571585),\n", - " ('拉', 568622),\n", - " ('口', 563811),\n", - " ('c', 561702),\n", - " ('会', 560024),\n", - " ('m', 559579),\n", - " ('尔', 559186),\n", - " ('入', 558897),\n", - " ('能', 554237),\n", - " ('二', 550386),\n", - " ('世', 550038),\n", - " ('發', 542170),\n", - " ('理', 539290),\n", - " ('d', 536237),\n", - " ('天', 532117),\n", - " ('工', 532015),\n", - " ('外', 530203),\n", - " ('合', 529451),\n", - " ('区', 528732),\n", - " ('表', 527233),\n", - " ('州', 526788),\n", - " ('S', 526181),\n", - " ('十', 523368),\n", - " ('定', 523324),\n", - " ('球', 520041),\n", - " ('A', 518904),\n", - " ('但', 516365),\n", - " ('期', 515809),\n", - " ('區', 514889),\n", - " ('化', 514021),\n", - " ('台', 512971),\n", - " ('科', 507284),\n", - " ('使', 506784),\n", - " ('全', 502586),\n", - " ('次', 502296),\n", - " ('性', 489355),\n", - " ('通', 484795),\n", - " ('者', 477756),\n", - " ('共', 469358),\n", - " ('C', 466320),\n", - " ('式', 463738),\n", - " ('重', 460538),\n", - " ('都', 458267),\n", - " ('並', 447402),\n", - " ('力', 446857),\n", - " ('來', 444149),\n", - " ('发', 443728),\n", - " ('安', 442999),\n", - " ('士', 442969),\n", - " ('布', 440096),\n", - " ('軍', 433577),\n", - " ('军', 425462),\n", - " ('比', 423014),\n", - " ('王', 420613),\n", - " ('城', 419587),\n", - " ('基', 418242),\n", - " ('原', 418019),\n", - " ('相', 417754),\n", - " ('院', 416280),\n", - " ('站', 414110),\n", - " 
('省', 412008),\n", - " ('明', 410219),\n", - " ('動', 406572),\n", - " ('目', 406002),\n", - " ('爾', 405283),\n", - " ('長', 403563),\n", - " ('这', 401324),\n", - " ('水', 400762),\n", - " ('司', 397010),\n", - " ('對', 393805),\n", - " ('正', 393688),\n", - " ('系', 392783),\n", - " ('治', 392555),\n", - " ('金', 392494),\n", - " ('英', 390793),\n", - " ('格', 390268),\n", - " ('米', 389906),\n", - " ('尼', 388701),\n", - " ('這', 388498),\n", - " ('府', 386464),\n", - " ('開', 385362),\n", - " ('M', 384393),\n", - " ('长', 382519),\n", - " ('与', 380872),\n", - " ('制', 378642),\n", - " ('四', 375847),\n", - " ('如', 372625),\n", - " ('或', 371731),\n", - " ('電', 371551),\n", - " ('列', 369194),\n", - " ('巴', 368453),\n", - " ('元', 367562),\n", - " ('星', 365070),\n", - " ('T', 364514),\n", - " ('港', 364113),\n", - " ('影', 362517),\n", - " ('起', 362407),\n", - " ('員', 360173),\n", - " ('現', 359882),\n", - " ('g', 359127),\n", - " ('亞', 358195),\n", - " ('对', 355573),\n", - " ('經', 354948),\n", - " ('首', 354483),\n", - " ('常', 354237),\n", - " ('東', 353661),\n", - " ('女', 353631),\n", - " ('提', 350015),\n", - " ('阿', 346882),\n", - " ('進', 344636),\n", - " ('改', 344086),\n", - " ('當', 341120),\n", - " ('来', 340798),\n", - " ('演', 339729),\n", - " ('林', 336815),\n", - " ('間', 335712),\n", - " ('B', 334835),\n", - " ('量', 334536),\n", - " ('并', 334134),\n", - " ('心', 332839),\n", - " ('過', 332363),\n", - " ('然', 330960),\n", - " ('向', 328494),\n", - " ('就', 328304),\n", - " ('东', 328088),\n", - " ('P', 326436),\n", - " ('始', 324881),\n", - " ('开', 323850),\n", - " ('接', 323779),\n", - " ('戰', 323477),\n", - " ('賽', 322898),\n", - " ('D', 322726),\n", - " ('界', 318638),\n", - " ('p', 318016),\n", - " ('體', 317834),\n", - " ('y', 314733),\n", - " ('曾', 313637),\n", - " ('动', 310998),\n", - " ('意', 310932),\n", - " ('將', 310562),\n", - " ('管', 307647),\n", - " ('機', 307041),\n", - " ('河', 307009),\n", - " ('車', 306644),\n", - " ('卡', 304322),\n", - " ('內', 304174),\n", - " ('镇', 303953),\n", - 
" ('手', 303526),\n", - " ('受', 302250),\n", - " ('员', 302057),\n", - " ('委', 300752),\n", - " ('程', 300520),\n", - " ('形', 299760),\n", - " ('交', 297947),\n", - " ('指', 296897),\n", - " ('色', 296783),\n", - " ('保', 296578),\n", - " ('場', 296368),\n", - " ('包', 295042),\n", - " ('I', 294062),\n", - " ('流', 292640),\n", - " ('型', 290557),\n", - " ('亚', 290181),\n", - " ('机', 290172),\n", - " ('已', 289800),\n", - " ('f', 288668),\n", - " ('香', 286281),\n", - " ('音', 285585),\n", - " ('些', 285145),\n", - " ('直', 283736),\n", - " ('内', 283452),\n", - " ('馬', 280524),\n", - " ('总', 279126),\n", - " ('太', 275584),\n", - " ('经', 274329),\n", - " ('集', 274113),\n", - " ('史', 273344),\n", - " ('身', 273032),\n", - " ('稱', 271301),\n", - " ('夫', 270599),\n", - " ('線', 270406),\n", - " ('过', 270359),\n", - " ('隊', 270205),\n", - " ('进', 268462),\n", - " ('五', 267428),\n", - " ('字', 267046),\n", - " ('該', 264882),\n", - " ('校', 264239),\n", - " ('品', 264130),\n", - " ('業', 263128),\n", - " ('县', 262995),\n", - " ('空', 262695),\n", - " ('现', 262407),\n", - " ('朝', 262041),\n", - " ('E', 260395),\n", - " ('种', 259792),\n", - " ('车', 258938),\n", - " ('更', 258819),\n", - " ('江', 257888),\n", - " ('R', 256996),\n", - " ('種', 256163),\n", - " ('展', 255724),\n", - " ('古', 255601),\n", - " ('社', 255211),\n", - " ('该', 254463),\n", - " ('电', 254124),\n", - " ('示', 252980),\n", - " ('间', 252767),\n", - " ('兩', 252402),\n", - " ('設', 252154),\n", - " ('马', 250001),\n", - " ('灣', 249947),\n", - " ('各', 247403),\n", - " ('持', 247021),\n", - " ('业', 246672),\n", - " ('洲', 245773),\n", - " ('著', 244952),\n", - " ('N', 244899),\n", - " ('官', 242881),\n", - " ('将', 241701),\n", - " ('回', 241313),\n", - " ('數', 241246),\n", - " ('反', 241211),\n", - " ('命', 240106),\n", - " ('体', 237646),\n", - " ('k', 237388),\n", - " ('战', 237369),\n", - " ('G', 234625),\n", - " ('京', 234164),\n", - " ('L', 233805),\n", - " ('活', 233002),\n", - " ('取', 232975),\n", - " ('羅', 231230),\n", - " ('光', 231063),\n", 
- " ('造', 230981),\n", - " ('達', 230526),\n", - " ('信', 227626),\n", - " ('選', 227508),\n", - " ('b', 226561),\n", - " ('石', 226459),\n", - " ('少', 225706),\n", - " ('只', 223992),\n", - " ('支', 223568),\n", - " ('村', 223319),\n", - " ('近', 223234),\n", - " ('族', 223081),\n", - " ('神', 222888),\n", - " ('清', 222725),\n", - " ('称', 222196),\n", - " ('曲', 221024),\n", - " ('普', 220896),\n", - " ('局', 220552),\n", - " ('当', 220275),\n", - " ('又', 219805),\n", - " ('属', 219701),\n", - " ('解', 219554),\n", - " ('約', 219466),\n", - " ('它', 219168),\n", - " ('设', 219067),\n", - " ('罗', 218750),\n", - " ('H', 218102),\n", - " ('情', 217894),\n", - " ('聯', 217887),\n", - " ('数', 217489),\n", - " ('蘭', 217217),\n", - " ('非', 216450),\n", - " ('队', 215761),\n", - " ('每', 215661),\n", - " ('研', 213291),\n", - " ('O', 212849),\n", - " ('運', 212729),\n", - " ('波', 212046),\n", - " ('號', 211509),\n", - " ('件', 210305),\n", - " ('角', 209998),\n", - " ('放', 209383),\n", - " ('縣', 208835),\n", - " ('联', 208570),\n", - " ('武', 207668),\n", - " ('片', 207506),\n", - " ('則', 206183),\n", - " ('關', 205509),\n", - " ('密', 205338),\n", - " ('總', 204923),\n", - " ('統', 204720),\n", - " ('版', 204203),\n", - " ('伊', 204184),\n", - " ('商', 203050),\n", - " ('即', 202966),\n", - " ('她', 202386),\n", - " ('華', 201388),\n", - " ('樂', 200087),\n", - " ('們', 199599),\n", - " ('李', 199272),\n", - " ('先', 199023),\n", - " ('計', 198509),\n", - " ('括', 197604),\n", - " ('两', 197523),\n", - " ('收', 196319),\n", - " ('F', 196115),\n", - " ('屬', 196024),\n", - " ('無', 195249),\n", - " ('器', 194519),\n", - " ('运', 194433),\n", - " ('往', 193706),\n", - " ('知', 193689),\n", - " ('场', 193256),\n", - " ('去', 193095),\n", - " ('座', 193025),\n", - " ('組', 192983),\n", - " ('白', 192869),\n", - " ('产', 191056),\n", - " ('未', 190989),\n", - " ('果', 190398),\n", - " ('再', 189266),\n", - " ('根', 188412),\n", - " ('歌', 188362),\n", - " ('視', 188245),\n", - " ('初', 188093),\n", - " ('赛', 186268),\n", - " ('华', 
186221),\n", - " ('足', 186158),\n", - " ('福', 185682),\n", - " ('六', 185381),\n", - " ('达', 185158),\n", - " ('组', 184960),\n", - " ('究', 184003),\n", - " ('v', 183837),\n", - " ('统', 183054),\n", - " ('且', 182916),\n", - " ('語', 182338),\n", - " ('今', 182157),\n", - " ('线', 182010),\n", - " ('级', 181119),\n", - " ('兵', 181058),\n", - " ('从', 180973),\n", - " ('引', 180509),\n", - " ('派', 180499),\n", - " ('從', 179255),\n", - " ('議', 179047),\n", - " ('居', 178918),\n", - " ('语', 178804),\n", - " ('亦', 178605),\n", - " ('完', 177535),\n", - " ('打', 177422),\n", - " ('帝', 176902),\n", - " ('域', 176620),\n", - " ('w', 176139),\n", - " ('们', 175906),\n", - " ('号', 175714),\n", - " ('參', 174989),\n", - " ('季', 174363),\n", - " ('宗', 174134),\n", - " ('塔', 174114),\n", - " ('关', 173075),\n", - " ('除', 171436),\n", - " ('播', 170322),\n", - " ('威', 169979),\n", - " ('书', 168981),\n", - " ('務', 168894),\n", - " ('花', 168407),\n", - " ('很', 168249),\n", - " ('團', 168210),\n", - " ('段', 168011),\n", - " ('令', 167846),\n", - " ('八', 167706),\n", - " ('服', 167562),\n", - " ('說', 167392),\n", - " ('實', 167336),\n", - " ('计', 167303),\n", - " ('島', 166734),\n", - " ('育', 166535),\n", - " ('義', 166489),\n", - " ('門', 165434),\n", - " ('速', 164945),\n", - " ('兰', 164821),\n", - " ('广', 164210),\n", - " ('言', 164091),\n", - " ('產', 163195),\n", - " ('結', 163155),\n", - " ('约', 162440),\n", - " ('土', 161667),\n", - " ('功', 161447),\n", - " ('另', 160843),\n", - " ('死', 160580),\n", - " ('置', 160481),\n", - " ('父', 159923),\n", - " ('修', 159640),\n", - " ('希', 159417),\n", - " ('好', 158431),\n", - " ('存', 158262),\n", - " ('周', 158136),\n", - " ('客', 158084),\n", - " ('類', 157931),\n", - " ('傳', 157902),\n", - " ('推', 157366),\n", - " ('超', 156866),\n", - " ('雷', 156703),\n", - " ('率', 155829),\n", - " ('宣', 155765),\n", - " ('V', 155153),\n", - " ('處', 154921),\n", - " ('洛', 154404),\n", - " ('查', 153766),\n", - " ('吉', 153490),\n", - " ('我', 153341),\n", - " ('供', 153324),\n", - " 
('九', 152986),\n", - " ('投', 152576),\n", - " ('具', 152376),\n", - " ('哈', 152301),\n", - " ('班', 151926),\n", - " ('街', 151717),\n", - " ('源', 151678),\n", - " ('認', 151616),\n", - " ('单', 151615),\n", - " ('实', 151390),\n", - " ('單', 151304),\n", - " ('博', 150101),\n", - " ('青', 150046),\n", - " ('党', 149938),\n", - " ('獲', 149856),\n", - " ('K', 149744),\n", - " ('瓦', 149325),\n", - " ('母', 148890),\n", - " ('百', 148184),\n", - " ('湖', 147901),\n", - " ('务', 147810),\n", - " ('書', 147788),\n", - " ('求', 147572),\n", - " ('田', 147532),\n", - " ('維', 146892),\n", - " ('印', 146693),\n", - " ('男', 146232),\n", - " ('维', 145957),\n", - " ('廣', 145742),\n", - " ('張', 145614),\n", - " ('伯', 145295),\n", - " ('火', 144739),\n", - " ('攻', 144671),\n", - " ('己', 144525),\n", - " ('故', 144506),\n", - " ('还', 144361),\n", - " ('落', 144305),\n", - " ('例', 144222),\n", - " ('舉', 144188),\n", - " ('案', 143967),\n", - " ('z', 143613),\n", - " ('那', 143562),\n", - " ('勒', 143562),\n", - " ('別', 143269),\n", - " ('皇', 142883),\n", - " ('助', 142780),\n", - " ('沙', 142772),\n", - " ('份', 142710),\n", - " ('航', 142510),\n", - " ('义', 142469),\n", - " ('W', 142165),\n", - " ('帶', 141966),\n", - " ('参', 141832),\n", - " ('议', 141443),\n", - " ('变', 141044),\n", - " ('图', 140532),\n", - " ('臺', 140228),\n", - " ('早', 140153),\n", - " ('應', 139981),\n", - " ('技', 139788),\n", - " ('資', 139704),\n", - " ('致', 139268),\n", - " ('導', 139221),\n", - " ('龍', 139102),\n", - " ('整', 138777),\n", - " ('容', 138509),\n", - " ('增', 137959),\n", - " ('副', 137825),\n", - " ('模', 137700),\n", - " ('點', 137387),\n", - " ('萬', 137295),\n", - " ('半', 137221),\n", - " ('登', 137071),\n", - " ('劇', 136654),\n", - " ('專', 136511),\n", - " ('均', 136496),\n", - " ('群', 135618),\n", - " ('七', 135403),\n", - " ('积', 134739),\n", - " ('川', 134691),\n", - " ('说', 134684),\n", - " ('擊', 134586),\n", - " ('堂', 134532),\n", - " ('铁', 134373),\n", - " ('師', 134069),\n", - " ('选', 134038),\n", - " ('越', 133820),\n", - 
" ('万', 133369),\n", - " ('級', 133094),\n", - " ('游', 132824),\n", - " ('失', 132291),\n", - " ('处', 132117),\n", - " ('门', 131694),\n", - " ('變', 131688),\n", - " ('点', 131461),\n", - " ('告', 130988),\n", - " ('易', 130770),\n", - " ('食', 130723),\n", - " ('領', 130618),\n", - " ('乡', 130599),\n", - " ('氏', 130340),\n", - " ('奥', 129907),\n", - " ('辖', 129634),\n", - " ('深', 129567),\n", - " ('黨', 129285),\n", - " ('風', 129159),\n", - " ('境', 129148),\n", - " ('领', 129075),\n", - " ('结', 128312),\n", - " ('紀', 128223),\n", - " ('室', 127991),\n", - " ('传', 127796),\n", - " ('黑', 127771),\n", - " ('塞', 127038),\n", - " ('射', 127006),\n", - " ('排', 126751),\n", - " ('升', 126543),\n", - " ('应', 126535),\n", - " ('需', 126341),\n", - " ('导', 126328),\n", - " ('限', 125959),\n", - " ('甲', 125958),\n", - " ('頭', 125448),\n", - " ('製', 125108),\n", - " ('团', 125104),\n", - " ('節', 125019),\n", - " ('乐', 124927),\n", - " ('連', 124841),\n", - " ('轉', 124586),\n", - " ('象', 124562),\n", - " ('像', 124360),\n", - " ('老', 124289),\n", - " ('无', 123544),\n", - " ('央', 123394),\n", - " ('真', 123086),\n", - " ('沒', 123072),\n", - " ('畫', 122604),\n", - " ('何', 122439),\n", - " ('U', 122366),\n", - " ('苏', 122365),\n", - " ('才', 122224),\n", - " ('类', 122219),\n", - " ('托', 121767),\n", - " ('仍', 121736),\n", - " ('获', 121698),\n", - " ('木', 121590),\n", - " ('便', 120778),\n", - " ('据', 120454),\n", - " ('洋', 120050),\n", - " ('鐵', 120011),\n", - " ('步', 119954),\n", - " ('牙', 119825),\n", - " ('森', 119639),\n", - " ('條', 119443),\n", - " ('報', 119391),\n", - " ('盟', 119374),\n", - " ('際', 119188),\n", - " ('奇', 119154),\n", - " ('氣', 118708),\n", - " ('權', 118602),\n", - " ('考', 118330),\n", - " ('席', 118307),\n", - " ('想', 118279),\n", - " ('魚', 118272),\n", - " ('積', 118008),\n", - " ('施', 117372),\n", - " ('感', 116760),\n", - " ('配', 116591),\n", - " ('哥', 116487),\n", - " ('J', 116467),\n", - " ('標', 116329),\n", - " ('決', 116227),\n", - " ('低', 116133),\n", - " ('奧', 115885),\n", 
- " ('論', 115797),\n", - " ('防', 115721),\n", - " ('纳', 115188),\n", - " ('师', 114762),\n", - " ('住', 114045),\n", - " ('效', 113893),\n", - " ('留', 113782),\n", - " ('止', 113746),\n", - " ('术', 113445),\n", - " ('律', 113314),\n", - " ('认', 113285),\n", - " ('姆', 113277),\n", - " ('把', 113095),\n", - " ('历', 112260),\n", - " ('底', 112207),\n", - " ('记', 111883),\n", - " ('還', 111668),\n", - " ('術', 111615),\n", - " ('埃', 111571),\n", - " ('泰', 111285),\n", - " ('親', 111130),\n", - " ('料', 111078),\n", - " ('曼', 110629),\n", - " ('病', 110519),\n", - " ('離', 110513),\n", - " ('冠', 110329),\n", - " ('佛', 110326),\n", - " ('视', 110308),\n", - " ('风', 110245),\n", - " ('纪', 110175),\n", - " ('阳', 109809),\n", - " ('千', 109448),\n", - " ('恩', 109346),\n", - " ('警', 109119),\n", - " ('據', 108854),\n", - " ('志', 108645),\n", - " ('降', 108596),\n", - " ('景', 108453),\n", - " ('看', 108416),\n", - " ('蒙', 108306),\n", - " ('报', 108276),\n", - " ('息', 107990),\n", - " ('俄', 107958),\n", - " ('強', 107936),\n", - " ('迪', 107648),\n", - " ('祖', 107637),\n", - " ('創', 107082),\n", - " ('察', 107009),\n", - " ('唱', 106912),\n", - " ('移', 106740),\n", - " ('納', 106718),\n", - " ('素', 106534),\n", - " ('圖', 106375),\n", - " ('着', 106345),\n", - " ('批', 105772),\n", - " ('张', 105675),\n", - " ('康', 105597),\n", - " ('革', 105447),\n", - " ('負', 105360),\n", - " ('職', 105179),\n", - " ('则', 104399),\n", - " ('愛', 104300),\n", - " ('消', 104273),\n", - " ('算', 104202),\n", - " ('承', 104198),\n", - " ('觀', 104137),\n", - " ('索', 104088),\n", - " ('較', 104086),\n", - " ('带', 104018),\n", - " ('授', 103682),\n", - " ('典', 103668),\n", - " ('_', 103620),\n", - " ('控', 103423),\n", - " ('遊', 103323),\n", - " ('念', 103024),\n", - " ('續', 102958),\n", - " ('藏', 102923),\n", - " ('汉', 101961),\n", - " ('专', 101882),\n", - " ('獎', 101672),\n", - " ('思', 101588),\n", - " ('許', 101259),\n", - " ('标', 100979),\n", - " ('記', 100883),\n", - " ('票', 100548),\n", - " ('責', 100538),\n", - " ('守', 
100446),\n", - " ('做', 100439),\n", - " ('友', 100220),\n", - " ('岛', 99854),\n", - " ('论', 99377),\n", - " ('没', 98977),\n", - " ('見', 98953),\n", - " ('精', 98863),\n", - " ('际', 98797),\n", - " ('别', 98091),\n", - " ('瑞', 97896),\n", - " ('裝', 97284),\n", - " ('督', 96375),\n", - " ('附', 96230),\n", - " ('望', 95809),\n", - " ('題', 95646),\n", - " ('陸', 95179),\n", - " ('依', 95157),\n", - " ('錄', 94822),\n", - " ('照', 94775),\n", - " ('房', 94743),\n", - " ('夏', 94681),\n", - " ('質', 94661),\n", - " ('述', 94433),\n", - " ('办', 94390),\n", - " ('转', 94371),\n", - " ('園', 94359),\n", - " ('毛', 94208),\n", - " ('草', 93824),\n", - " ('拔', 93588),\n", - " ('章', 93577),\n", - " ('圣', 93412),\n", - " ('抗', 93149),\n", - " ('編', 93136),\n", - " ('歷', 93082),\n", - " ('击', 92879),\n", - " ('资', 92821),\n", - " ('調', 92575),\n", - " ('权', 92382),\n", - " ('網', 92357),\n", - " ('条', 91960),\n", - " ('职', 91787),\n", - " ('节', 91659),\n", - " ('气', 91538),\n", - " ('封', 91341),\n", - " ('决', 91294),\n", - " ('富', 91278),\n", - " ('退', 91039),\n", - " ('般', 90979),\n", - " ('停', 90904),\n", - " ('澳', 90811),\n", - " ('左', 90752),\n", - " ('蘇', 90677),\n", - " ('永', 90655),\n", - " ('極', 90650),\n", - " ('复', 90497),\n", - " ('辦', 90486),\n", - " ('丹', 90361),\n", - " ('唐', 90129),\n", - " ('x', 89981),\n", - " ('陆', 89933),\n", - " ('試', 89718),\n", - " ('堡', 89589),\n", - " ('戲', 89325),\n", - " ('爭', 89189),\n", - " ('丁', 89186),\n", - " ('陳', 89112),\n", - " ('牌', 89029),\n", - " ('构', 88981),\n", - " ('值', 88924),\n", - " ('拿', 88753),\n", - " ('快', 88628),\n", - " ('終', 88618),\n", - " ('隨', 88511),\n", - " ('船', 88483),\n", - " ('歐', 88391),\n", - " ('營', 88363),\n", - " ('候', 88347),\n", - " ('聖', 88231),\n", - " ('坦', 88094),\n", - " ('破', 87862),\n", - " ('隆', 87850),\n", - " ('梅', 87461),\n", - " ('寺', 87390),\n", - " ('給', 87337),\n", - " ('邦', 87005),\n", - " ('植', 86765),\n", - " ('郡', 86725),\n", - " ('鎮', 86584),\n", - " ('萨', 86424),\n", - " ('擔', 86366),\n", - " 
('莱', 85980),\n", - " ('莫', 85573),\n", - " ('協', 85570),\n", - " ('含', 85530),\n", - " ('调', 85358),\n", - " ('問', 85127),\n", - " ('黃', 84929),\n", - " ('似', 84723),\n", - " ('突', 84699),\n", - " ('皮', 84610),\n", - " ('创', 84203),\n", - " ('必', 83904),\n", - " ('興', 83729),\n", - " ('尚', 83459),\n", - " ('准', 83283),\n", - " ('旅', 83109),\n", - " ('强', 83039),\n", - " ('眾', 83019),\n", - " ('蒂', 82986),\n", - " ('走', 82714),\n", - " ('項', 82683),\n", - " ('举', 82639),\n", - " ('离', 82524),\n", - " ('注', 82492),\n", - " ('剧', 82482),\n", - " ('久', 82239),\n", - " ('争', 82222),\n", - " ('嘉', 81692),\n", - " ('岸', 81671),\n", - " ('济', 81487),\n", - " ('樣', 81427),\n", - " ('藝', 81322),\n", - " ('划', 81297),\n", - " ('切', 81164),\n", - " ('右', 80824),\n", - " ('飛', 80759),\n", - " ('兒', 80726),\n", - " ('头', 80687),\n", - " ('店', 80650),\n", - " ('延', 80407),\n", - " ('朗', 80300),\n", - " ('轄', 80261),\n", - " ('网', 80054),\n", - " ('漢', 79838),\n", - " ('鄉', 79658),\n", - " ('随', 79617),\n", - " ('规', 79594),\n", - " ('野', 79035),\n", - " ('架', 79007),\n", - " ('弟', 78994),\n", - " ('熱', 78903),\n", - " ('邊', 78863),\n", - " ('龙', 78739),\n", - " ('倫', 78712),\n", - " ('春', 78513),\n", - " ('装', 78421),\n", - " ('连', 78387),\n", - " ('核', 78315),\n", - " ('售', 78277),\n", - " ('给', 78244),\n", - " ('衛', 78045),\n", - " ('护', 77913),\n", - " ('館', 77769),\n", - " ('亡', 77693),\n", - " ('規', 77647),\n", - " ('协', 77539),\n", - " ('质', 77406),\n", - " ('油', 77375),\n", - " ('编', 77293),\n", - " ('環', 77090),\n", - " ('艾', 76833),\n", - " ('層', 76828),\n", - " ('備', 76649),\n", - " ('讓', 76635),\n", - " ('構', 76365),\n", - " ('佳', 76056),\n", - " ('弗', 76030),\n", - " ('占', 75999),\n", - " ('判', 75750),\n", - " ('黄', 75658),\n", - " ('雄', 75547),\n", - " ('X', 75528),\n", - " ('繼', 75474),\n", - " ('筑', 75385),\n", - " ('端', 75135),\n", - " ('鲁', 75054),\n", - " ('短', 74940),\n", - " ('股', 74936),\n", - " ('题', 74471),\n", - " ('圍', 74137),\n", - " ('耶', 74110),\n", - 
" ('卫', 73704),\n", - " ('许', 73696),\n", - " ('遭', 73526),\n", - " ('戏', 73441),\n", - " ('輯', 73372),\n", - " ('问', 73239),\n", - " ('護', 73075),\n", - " ('宁', 72970),\n", - " ('策', 72800),\n", - " ('薩', 72779),\n", - " ('雙', 72740),\n", - " ('样', 72679),\n", - " ('逐', 72434),\n", - " ('松', 72320),\n", - " ('赫', 72312),\n", - " ('担', 72226),\n", - " ('馆', 72177),\n", - " ('評', 72162),\n", - " ('观', 71822),\n", - " ('帕', 71758),\n", - " ('距', 71703),\n", - " ('役', 71667),\n", - " ('聲', 71563),\n", - " ('摩', 71501),\n", - " ...]" - ] - }, - "execution_count": 155, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 162, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'all_wiki_content' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mdel\u001b[0m \u001b[0mall_wiki_content\u001b[0m \u001b[1;31m# 内存不够\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mgram_length\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtwo_gram_counts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCounter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mall_character\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mgram_length\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mall_character\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mgram_length\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'all_wiki_content' is not defined" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 163, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({'新华': 135490,\n", - " '华社': 129104,\n", - " '社照': 45003,\n", - " '照片': 46712,\n", - " '片东': 235,\n", - " '东莞': 1543,\n", - " '莞广': 195,\n", - " '广东': 5182,\n", - " '东2': 922,\n", - " '20': 123427,\n", - " '01': 102583,\n", - " '17': 81801,\n", - " '7年': 59051,\n", - " '年4': 21904,\n", - " '4月': 51236,\n", - " '月7': 3777,\n", - " '7日': 12919,\n", - " '日n': 48360,\n", - " 'n体': 18616,\n", - " '体育': 27169,\n", - " '育9': 438,\n", - " '9篮': 75,\n", - " '篮球': 5154,\n", - " '球C': 416,\n", - " 'CB': 1003,\n", - " 'BA': 5552,\n", - " 'A总': 939,\n", - " '总决': 1968,\n", - " '决赛': 16108,\n", - " '赛第': 6484,\n", - " '第四': 2788,\n", - " '四场': 474,\n", - " '场广': 176,\n", - " '东对': 83,\n", - " '对阵': 2153,\n", - " '阵新': 88,\n", - " '新疆': 4703,\n", - " '疆n': 134,\n", - " 'n4': 23273,\n", - " '日广': 727,\n", - " '东东': 634,\n", - " '莞银': 555,\n", - " '银行': 9538,\n", - " '行队': 935,\n", - " '队球': 10498,\n", - " '球员': 13102,\n", - " '员易': 71,\n", - " '易建': 110,\n", - " '建联': 161,\n", - " '联在': 98,\n", - " '在比': 9666,\n", - " '比赛': 29313,\n", - " '赛中': 26877,\n", - " '中扣': 112,\n", - " '扣篮': 118,\n", - " '篮n': 720,\n", - " 'n当': 25000,\n", - " '当日': 24291,\n", - " '日在': 39495,\n", - " '在2': 9127,\n", - " '16': 25070,\n", - " '62': 7886,\n", - " '7赛': 7121,\n", - " '赛季': 9782,\n", - " '季中': 1411,\n", - " '中国': 77776,\n", - " '国男': 1192,\n", - " '男子': 5085,\n", - " '子篮': 421,\n", - " '球职': 422,\n", - " '职业': 3435,\n", - " '业联': 791,\n", - " '联赛': 18165,\n", - " '赛C': 
429,\n", - " '场比': 2857,\n", - " '中广': 562,\n", - " '队主': 5662,\n", - " '主场': 5943,\n", - " '场迎': 386,\n", - " '迎战': 618,\n", - " '战新': 164,\n", - " '疆喀': 764,\n", - " '喀什': 1017,\n", - " '什古': 620,\n", - " '古城': 1428,\n", - " '城队': 1579,\n", - " '队n': 7145,\n", - " 'n新': 78433,\n", - " '社记': 27920,\n", - " '记者': 56946,\n", - " '者孟': 238,\n", - " '孟永': 121,\n", - " '永民': 122,\n", - " '民摄': 220,\n", - " '摄n': 11579,\n", - " '社北': 5509,\n", - " '北京': 27639,\n", - " '京4': 1073,\n", - " '4月': 7783,\n", - " '月1': 6440,\n", - " '14': 4021,\n", - " '4日': 3202,\n", - " '日新': 3927,\n", - " '新媒': 2760,\n", - " '媒体': 11080,\n", - " '体专': 2203,\n", - " '专电': 2123,\n", - " '电记': 11810,\n", - " '者杨': 1405,\n", - " '杨烨': 18,\n", - " '烨作': 4,\n", - " '作为': 9679,\n", - " '为国': 1607,\n", - " '国民': 2752,\n", - " '民经': 220,\n", - " '经济': 25304,\n", - " '济的': 973,\n", - " '的重': 7007,\n", - " '重要': 14453,\n", - " '要支': 278,\n", - " '支柱': 351,\n", - " '柱央': 2,\n", - " '央企': 698,\n", - " '企一': 23,\n", - " '一季': 2660,\n", - " '季度': 3356,\n", - " '度交': 57,\n", - " '交上': 55,\n", - " '上了': 1557,\n", - " '了一': 8259,\n", - " '一份': 1723,\n", - " '份漂': 3,\n", - " '漂亮': 340,\n", - " '亮的': 370,\n", - " '的成': 3755,\n", - " '成绩': 3672,\n", - " '绩单': 148,\n", - " '单国': 11,\n", - " '国资': 662,\n", - " '资委': 322,\n", - " '委总': 32,\n", - " '总会': 325,\n", - " '会计': 404,\n", - " '计师': 2453,\n", - " '师沈': 17,\n", - " '沈莹': 26,\n", - " '莹在': 45,\n", - " '在1': 849,\n", - " '13': 4147,\n", - " '3日': 3456,\n", - " '日国': 1034,\n", - " '国新': 1874,\n", - " '新办': 233,\n", - " '办新': 88,\n", - " '新闻': 11645,\n", - " '闻发': 1950,\n", - " '发布': 10817,\n", - " '布会': 3455,\n", - " '会上': 5442,\n", - " '上表': 967,\n", - " '表示': 20215,\n", - " '示一': 618,\n", - " '度中': 340,\n", - " '中央': 7244,\n", - " '企业': 27381,\n", - " '业累': 52,\n", - " '累计': 1961,\n", - " '计实': 119,\n", - " '实现': 10625,\n", - " '现利': 112,\n", - " '利润': 1653,\n", - " '润总': 99,\n", - " '总额': 939,\n", - " '额3': 23,\n", - " '31': 2316,\n", - " '12': 6255,\n", - 
" '20': 24199,\n", - " '0亿': 1184,\n", - " '亿元': 8923,\n", - " '元同': 1121,\n", - " '同比': 3888,\n", - " '比增': 2439,\n", - " '增长': 10003,\n", - " '长2': 316,\n", - " '23': 2952,\n", - " '32': 1293,\n", - " '3月': 2125,\n", - " '月当': 95,\n", - " '当月': 207,\n", - " '月利': 25,\n", - " '额达': 428,\n", - " '达到': 4860,\n", - " '到历': 52,\n", - " '历史': 7278,\n", - " '史同': 48,\n", - " '同期': 1240,\n", - " '期最': 119,\n", - " '最好': 1240,\n", - " '好水': 68,\n", - " '水平': 5432,\n", - " '平1': 34,\n", - " '10': 11467,\n", - " '02': 2368,\n", - " '2家': 122,\n", - " '家中': 1399,\n", - " '业中': 830,\n", - " '中有': 1733,\n", - " '有9': 119,\n", - " '99': 1404,\n", - " '9家': 80,\n", - " '家盈': 11,\n", - " '盈利': 805,\n", - " '利8': 5,\n", - " '81': 602,\n", - " '1家': 102,\n", - " '家企': 1095,\n", - " '业增': 474,\n", - " '增利': 30,\n", - " '利4': 12,\n", - " '43': 826,\n", - " '3家': 112,\n", - " '业效': 87,\n", - " '效益': 820,\n", - " '益增': 153,\n", - " '增幅': 560,\n", - " '幅超': 190,\n", - " '超过': 6453,\n", - " '过1': 607,\n", - " '0一': 21,\n", - " '一些': 8372,\n", - " '些长': 22,\n", - " '长期': 3531,\n", - " '期亏': 23,\n", - " '亏损': 438,\n", - " '损的': 118,\n", - " '的企': 1465,\n", - " '业实': 372,\n", - " '现扭': 10,\n", - " '扭亏': 61,\n", - " '亏为': 40,\n", - " '为盈': 44,\n", - " '盈n': 24,\n", - " 'n值': 276,\n", - " '值得': 1745,\n", - " '得注': 396,\n", - " '注意': 2000,\n", - " '意的': 956,\n", - " '的是': 6371,\n", - " '是她': 369,\n", - " '她同': 61,\n", - " '同时': 10754,\n", - " '时透': 47,\n", - " '透露': 1413,\n", - " '露了': 194,\n", - " '了下': 257,\n", - " '下一': 3928,\n", - " '一步': 8608,\n", - " '步央': 4,\n", - " '企改': 193,\n", - " '改革': 10095,\n", - " '革重': 72,\n", - " '重点': 5668,\n", - " '点国': 79,\n", - " '委将': 90,\n", - " '将加': 535,\n", - " '加快': 3248,\n", - " '快推': 455,\n", - " '推动': 8002,\n", - " '动中': 1943,\n", - " '业战': 43,\n", - " '战略': 5892,\n", - " '略性': 391,\n", - " '性重': 44,\n", - " '重组': 796,\n", - " '组加': 16,\n", - " '快钢': 6,\n", - " '钢铁': 893,\n", - " '铁煤': 76,\n", - " '煤炭': 822,\n", - " '炭煤': 10,\n", - " '煤化': 59,\n", 
- " '化工': 799,\n", - " '工等': 204,\n", - " '等业': 146,\n", - " '业务': 5239,\n", - " '务的': 1911,\n", - " '的专': 1370,\n", - " '专业': 5770,\n", - " '业化': 1152,\n", - " '化整': 61,\n", - " '整合': 1203,\n", - " '合n': 311,\n", - " 'n一': 5965,\n", - " '现净': 97,\n", - " '净利': 648,\n", - " '润2': 7,\n", - " '22': 2948,\n", - " '26': 2599,\n", - " '64': 769,\n", - " '42': 1006,\n", - " '2亿': 333,\n", - " '65': 973,\n", - " '5其': 18,\n", - " '其中': 9548,\n", - " '中归': 5,\n", - " '归属': 179,\n", - " '属于': 1777,\n", - " '于母': 28,\n", - " '母公': 126,\n", - " '公司': 26427,\n", - " '司所': 76,\n", - " '所有': 4421,\n", - " '有者': 127,\n", - " '者的': 2395,\n", - " '的净': 163,\n", - " '润1': 10,\n", - " '27': 2077,\n", - " '7亿': 299,\n", - " '24': 2719,\n", - " '46': 745,\n", - " '6沈': 1,\n", - " '莹告': 6,\n", - " '告诉': 3787,\n", - " '诉记': 2063,\n", - " '者其': 136,\n", - " '中石': 438,\n", - " '石油': 1560,\n", - " '油石': 64,\n", - " '石化': 551,\n", - " '化钢': 15,\n", - " '铁有': 71,\n", - " '有色': 220,\n", - " '色煤': 9,\n", - " '炭等': 29,\n", - " '等传': 198,\n", - " '传统': 7639,\n", - " '统产': 223,\n", - " '产业': 14123,\n", - " '益逐': 10,\n", - " '逐步': 1891,\n", - " '步回': 101,\n", - " '回升': 609,\n", - " '升先': 12,\n", - " '先进': 2113,\n", - " '进制': 199,\n", - " '制造': 6122,\n", - " '造业': 1820,\n", - " '业医': 109,\n", - " '医药': 1460,\n", - " '药现': 15,\n", - " '现代': 3155,\n", - " '代服': 177,\n", - " '服务': 20067,\n", - " '务业': 917,\n", - " '业等': 654,\n", - " '等行': 461,\n", - " '行业': 7341,\n", - " '益贡': 7,\n", - " '贡献': 2324,\n", - " '献稳': 6,\n", - " '稳步': 608,\n", - " '步提': 641,\n", - " '提升': 6100,\n", - " '升n': 493,\n", - " 'n在': 7524,\n", - " '在她': 300,\n", - " '她看': 81,\n", - " '看来': 1415,\n", - " '来央': 42,\n", - " '企利': 16,\n", - " '润大': 15,\n", - " '大幅': 2134,\n", - " '幅回': 168,\n", - " '升一': 72,\n", - " '一是': 1406,\n", - " '是抓': 72,\n", - " '抓住': 762,\n", - " '住了': 478,\n", - " '了大': 1483,\n", - " '大宗': 542,\n", - " '宗商': 375,\n", - " '商品': 3353,\n", - " '品市': 225,\n", - " '市场': 22666,\n", - " '场回': 59,\n", - " '回调': 
167,\n", - " '调的': 258,\n", - " '的有': 1715,\n", - " '有利': 1779,\n", - " '利机': 60,\n", - " '机遇': 1964,\n", - " '遇加': 36,\n", - " '加大': 2469,\n", - " '大生': 181,\n", - " '生产': 8724,\n", - " '产组': 32,\n", - " '组织': 12481,\n", - " '织力': 30,\n", - " '力度': 2151,\n", - " '业抢': 13,\n", - " '抢抓': 162,\n", - " '抓市': 6,\n", - " '场机': 157,\n", - " '遇在': 41,\n", - " '在价': 85,\n", - " '价格': 6823,\n", - " '格回': 47,\n", - " '升的': 375,\n", - " '的时': 5529,\n", - " '时候': 3767,\n", - " '候加': 5,\n", - " '大排': 48,\n", - " '排产': 6,\n", - " '产力': 158,\n", - " '度提': 249,\n", - " '提高': 6118,\n", - " '高生': 81,\n", - " '产负': 219,\n", - " '负荷': 146,\n", - " '荷抓': 2,\n", - " '住市': 18,\n", - " '场的': 3422,\n", - " '的窗': 157,\n", - " '窗口': 910,\n", - " '口期': 62,\n", - " '期提': 97,\n", - " '高产': 171,\n", - " '产销': 302,\n", - " '销量': 659,\n", - " '量二': 14,\n", - " '二是': 1134,\n", - " '是得': 85,\n", - " '得益': 438,\n", - " '益于': 536,\n", - " '于加': 458,\n", - " '大供': 22,\n", - " '供给': 1792,\n", - " '给侧': 1008,\n", - " '侧结': 710,\n", - " '结构': 3804,\n", - " '构性': 1021,\n", - " '性改': 809,\n", - " '革力': 32,\n", - " '度特': 36,\n", - " '特别': 4664,\n", - " '别是': 2253,\n", - " '是钢': 22,\n", - " '炭去': 37,\n", - " '去产': 345,\n", - " '产能': 1973,\n", - " '能工': 228,\n", - " '工作': 25311,\n", - " '作在': 506,\n", - " '在2': 1640,\n", - " '01': 15505,\n", - " '16': 7193,\n", - " '6年': 4210,\n", - " '年取': 53,\n", - " '取得': 4713,\n", - " '得了': 1941,\n", - " '了很': 1071,\n", - " '很好': 905,\n", - " '好的': 4474,\n", - " '的进': 1309,\n", - " '进展': 1459,\n", - " '展化': 11,\n", - " '化解': 545,\n", - " '解钢': 15,\n", - " '铁过': 16,\n", - " '过剩': 438,\n", - " '剩产': 143,\n", - " '能1': 30,\n", - " '19': 5692,\n", - " '9万': 239,\n", - " '万吨': 1402,\n", - " '吨化': 7,\n", - " '解煤': 22,\n", - " '炭过': 21,\n", - " '能3': 12,\n", - " '34': 932,\n", - " '49': 673,\n", - " '97': 882,\n", - " '7万': 380,\n", - " '吨均': 10,\n", - " '均超': 140,\n", - " '超额': 94,\n", - " '额完': 53,\n", - " '完成': 6661,\n", - " '成年': 907,\n", - " '年度': 2106,\n", - " '度任': 17,\n", 
- " '任务': 4218,\n", - " '务提': 190,\n", - " '高了': 646,\n", - " '了先': 56,\n", - " '进产': 173,\n", - " '能的': 910,\n", - " '的利': 884,\n", - " '利用': 5295,\n", - " '用效': 126,\n", - " '效率': 1430,\n", - " '率稳': 39,\n", - " '稳定': 4016,\n", - " '定了': 1408,\n", - " '了市': 335,\n", - " '场局': 12,\n", - " '局面': 893,\n", - " '面促': 40,\n", - " '促进': 5413,\n", - " '进了': 929,\n", - " '了行': 123,\n", - " '业健': 168,\n", - " '健康': 5468,\n", - " '康发': 585,\n", - " '发展': 36410,\n", - " '展n': 3577,\n", - " 'n改': 227,\n", - " '革积': 20,\n", - " '积极': 6711,\n", - " '极成': 82,\n", - " '成效': 986,\n", - " '效已': 11,\n", - " '已经': 10854,\n", - " '经显': 24,\n", - " '显现': 471,\n", - " '现特': 47,\n", - " '是积': 79,\n", - " '极开': 232,\n", - " '开展': 8895,\n", - " '展重': 101,\n", - " '组和': 219,\n", - " '和调': 171,\n", - " '调整': 3644,\n", - " '整工': 17,\n", - " '作提': 344,\n", - " '了存': 19,\n", - " '存量': 392,\n", - " '量资': 218,\n", - " '资源': 7018,\n", - " '源的': 629,\n", - " '的配': 352,\n", - " '配置': 1034,\n", - " '置效': 25,\n", - " '率也': 210,\n", - " '也是': 6373,\n", - " '是央': 35,\n", - " '润回': 10,\n", - " '升非': 14,\n", - " '非常': 4885,\n", - " '常重': 409,\n", - " '要的': 3469,\n", - " '的原': 1841,\n", - " '原因': 3356,\n", - " '因沈': 2,\n", - " '莹说': 77,\n", - " '说她': 298,\n", - " '她进': 23,\n", - " '进一': 7264,\n", - " '步表': 27,\n", - " '示去': 126,\n", - " '去年': 6315,\n", - " '年宝': 14,\n", - " '宝钢': 32,\n", - " '钢和': 13,\n", - " '和武': 204,\n", - " '武钢': 61,\n", - " '钢进': 2,\n", - " '进行': 23528,\n", - " '行重': 165,\n", - " '组成': 2705,\n", - " '成立': 4345,\n", - " '立了': 1774,\n", - " '了宝': 80,\n", - " '宝武': 17,\n", - " '武集': 15,\n", - " '集团': 7298,\n", - " '团新': 50,\n", - " '新集': 32,\n", - " '团成': 280,\n", - " '立以': 397,\n", - " '以后': 1155,\n", - " '后可': 536,\n", - " '可以': 11898,\n", - " '以减': 133,\n", - " '减少': 2940,\n", - " '少重': 28,\n", - " '重复': 431,\n", - " '复建': 67,\n", - " '建设': 18606,\n", - " '设同': 57,\n", - " '时也': 1494,\n", - " '也减': 18,\n", - " '少了': 427,\n", - " '了管': 46,\n", - " '管理': 12621,\n", - " '理成': 97,\n", - " 
'成本': 3543,\n", - " '本提': 97,\n", - " '了效': 24,\n", - " '率再': 14,\n", - " '再比': 34,\n", - " '比如': 1893,\n", - " '如煤': 7,\n", - " '炭行': 41,\n", - " '业去': 91,\n", - " '年调': 54,\n", - " '整力': 6,\n", - " '度也': 257,\n", - " '也很': 574,\n", - " '很大': 1591,\n", - " '大成': 350,\n", - " '了煤': 17,\n", - " '炭资': 22,\n", - " '源整': 89,\n", - " '合平': 31,\n", - " '平台': 9763,\n", - " '台已': 65,\n", - " '经取': 89,\n", - " '效n': 339,\n", - " 'n据': 5941,\n", - " '据了': 2169,\n", - " '了解': 7101,\n", - " '解2': 43,\n", - " '年尤': 22,\n", - " '尤其': 1835,\n", - " '其是': 1390,\n", - " '是下': 128,\n", - " '下半': 1073,\n", - " '半年': 1692,\n", - " '年以': 2810,\n", - " '以来': 7949,\n", - " '企间': 4,\n", - " '间重': 50,\n", - " '组整': 33,\n", - " '合案': 12,\n", - " '案例': 780,\n", - " '例纷': 2,\n", - " '纷至': 62,\n", - " '至沓': 62,\n", - " '沓来': 62,\n", - " '来继': 33,\n", - " '继7': 1,\n", - " '7月': 902,\n", - " '11': 5998,\n", - " '1日': 4770,\n", - " '日中': 3695,\n", - " '国港': 89,\n", - " '港中': 340,\n", - " '中旅': 53,\n", - " '旅集': 28,\n", - " '团公': 523,\n", - " '司与': 286,\n", - " '与中': 1989,\n", - " '国国': 4636,\n", - " '国旅': 374,\n", - " '团有': 459,\n", - " '有限': 5842,\n", - " '限公': 4634,\n", - " '司发': 257,\n", - " '布公': 321,\n", - " '公告': 2147,\n", - " '告称': 503,\n", - " '称实': 58,\n", - " '实施': 7568,\n", - " '施战': 20,\n", - " '略重': 42,\n", - " '组之': 11,\n", - " '之后': 4283,\n", - " '后7': 13,\n", - " '委罕': 2,\n", - " '罕见': 406,\n", - " '见做': 6,\n", - " '做加': 31,\n", - " '加法': 114,\n", - " '法组': 37,\n", - " '组建': 1027,\n", - " '建中': 322,\n", - " '国航': 1289,\n", - " '航空': 4126,\n", - " '空发': 80,\n", - " '发动': 1184,\n", - " '动机': 756,\n", - " '机集': 59,\n", - " '团7': 4,\n", - " '15': 5946,\n", - " '5日': 3773,\n", - " '中粮': 76,\n", - " '粮集': 21,\n", - " '国中': 1179,\n", - " '中纺': 6,\n", - " '纺集': 2,\n", - " '司实': 145,\n", - " '组8': 8,\n", - " '8月': 501,\n", - " '月2': 6302,\n", - " '2日': 3333,\n", - " '国建': 769,\n", - " '建筑': 3808,\n", - " '筑材': 61,\n", - " '材料': 2236,\n", - " '料集': 11,\n", - " '中材': 4,\n", - " '材集': 9,\n", 
- " '施重': 59,\n", - " '组9': 3,\n", - " '9月': 613,\n", - " '日宝': 21,\n", - " '钢集': 71,\n", - " '与武': 70,\n", - " '武汉': 4198,\n", - " '汉钢': 12,\n", - " '铁集': 84,\n", - " '组1': 32,\n", - " '1月': 1464,\n", - " '国储': 34,\n", - " '储备': 683,\n", - " '备粮': 16,\n", - " '粮管': 9,\n", - " '理总': 149,\n", - " '总公': 227,\n", - " '备棉': 2,\n", - " '棉管': 2,\n", - " '组除': 6,\n", - " '除此': 198,\n", - " '此之': 395,\n", - " '之外': 1141,\n", - " '外1': 29,\n", - " '0日': 3246,\n", - " '国恒': 9,\n", - " '恒天': 25,\n", - " '天与': 118,\n", - " '国机': 286,\n", - " '机械': 1369,\n", - " '械工': 75,\n", - " '工业': 5552,\n", - " '业集': 760,\n", - " '司签': 164,\n", - " '签署': 2247,\n", - " '署重': 11,\n", - " '组协': 7,\n", - " '协议': 4183,\n", - " '议中': 622,\n", - " '天整': 13,\n", - " '整体': 2324,\n", - " '体产': 203,\n", - " '产权': 1615,\n", - " '权将': 27,\n", - " '将无': 113,\n", - " '无偿': 134,\n", - " '偿划': 8,\n", - " '划转': 51,\n", - " '转进': 14,\n", - " '进入': 8197,\n", - " '入国': 444,\n", - " '团截': 3,\n", - " '截至': 3481,\n", - " '至目': 789,\n", - " '目前': 15296,\n", - " '前由': 130,\n", - " '由国': 468,\n", - " '委直': 7,\n", - " '直接': 3826,\n", - " '接监': 8,\n", - " '监管': 5116,\n", - " '管的': 407,\n", - " '的中': 6759,\n", - " '业数': 214,\n", - " '数量': 2598,\n", - " '量已': 283,\n", - " '经降': 37,\n", - " '降至': 539,\n", - " '至1': 891,\n", - " '家n': 847,\n", - " 'n沈': 111,\n", - " '莹坦': 2,\n", - " '坦言': 378,\n", - " '言下': 21,\n", - " '企经': 36,\n", - " '经营': 4337,\n", - " '营仍': 5,\n", - " '仍面': 109,\n", - " '面临': 2798,\n", - " '临着': 237,\n", - " '着很': 95,\n", - " '很多': 5336,\n", - " '多不': 222,\n", - " '不确': 565,\n", - " '确定': 2481,\n", - " '定和': 451,\n", - " '和不': 328,\n", - " '不稳': 291,\n", - " '定因': 139,\n", - " '因素': 2170,\n", - " '素下': 3,\n", - " '步效': 3,\n", - " '增速': 1786,\n", - " '速可': 46,\n", - " '可能': 9800,\n", - " '能有': 725,\n", - " '有所': 1861,\n", - " '所放': 80,\n", - " '放缓': 587,\n", - " '缓一': 12,\n", - " '是国': 1618,\n", - " '国际': 38923,\n", - " '际市': 394,\n", - " '场不': 297,\n", - " '定性': 962,\n", - " '性较': 157,\n", - " '较大': 
1674,\n", - " '大国': 1087,\n", - " '际政': 53,\n", - " '政治': 5158,\n", - " '治动': 24,\n", - " '动荡': 240,\n", - " '荡加': 10,\n", - " '加剧': 405,\n", - " '剧贸': 2,\n", - " '贸易': 6135,\n", - " '易摩': 25,\n", - " '摩擦': 138,\n", - " '擦增': 3,\n", - " '增多': 501,\n", - " '多汇': 3,\n", - " '汇率': 1302,\n", - " '率大': 179,\n", - " '品价': 373,\n", - " '格等': 62,\n", - " '等波': 4,\n", - " '波动': 659,\n", - " '动加': 125,\n", - " '产成': 142,\n", - " '本和': 308,\n", - " '和融': 98,\n", - " '融资': 2982,\n", - " '资成': 179,\n", - " '本上': 584,\n", - " '上升': 2063,\n", - " '升压': 10,\n", - " '压力': 2576,\n", - " '力较': 190,\n", - " '大二': 95,\n", - " '是中': 3672,\n", - " '业解': 61,\n", - " '解决': 5182,\n", - " '决历': 8,\n", - " '史遗': 86,\n", - " '遗留': 106,\n", - " '留问': 68,\n", - " '问题': 16448,\n", - " '题化': 21,\n", - " '解过': 96,\n", - " '能处': 43,\n", - " '处置': 974,\n", - " '置僵': 21,\n", - " '僵尸': 102,\n", - " '尸企': 62,\n", - " '等方': 2750,\n", - " '方面': 12243,\n", - " '面改': 112,\n", - " '革成': 66,\n", - " '本支': 41,\n", - " '支出': 597,\n", - " '出压': 49,\n", - " '力增': 183,\n", - " '增大': 254,\n", - " '大三': 114,\n", - " '三是': 740,\n", - " '是受': 202,\n", - " '受电': 56,\n", - " '电煤': 42,\n", - " '煤价': 63,\n", - " '格上': 389,\n", - " '上涨': 3910,\n", - " '涨火': 2,\n", - " '火电': 105,\n", - " '电上': 21,\n", - " '上网': 349,\n", - " '网价': 4,\n", - " '格下': 303,\n", - " '下调': 553,\n", - " '调以': 70,\n", - " '以及': 9147,\n", - " '及市': 132,\n", - " '场化': 629,\n", - " '化直': 12,\n", - " '直供': 22,\n", - " '供电': 362,\n", - " '电增': 15,\n", - " '增加': 5690,\n", - " '加等': 46,\n", - " '等因': 445,\n", - " '素影': 206,\n", - " '影响': 8739,\n", - " '响煤': 9,\n", - " '煤电': 202,\n", - " '电企': 117,\n", - " '益大': 31,\n", - " '幅下': 368,\n", - " '下滑': 753,\n", - " '滑亏': 2,\n", - " '损加': 5,\n", - " '剧她': 4,\n", - " '她表': 169,\n", - " '示下': 131,\n", - " '步国': 45,\n", - " '委和': 189,\n", - " '和央': 63,\n", - " '企将': 19,\n", - " '将进': 1183,\n", - " '步深': 219,\n", - " '深入': 3101,\n", - " '入开': 173,\n", - " '展降': 11,\n", - " '降本': 49,\n", - " '本增': 154,\n", - " '增效': 
256,\n", - " '效工': 52,\n", - " '作加': 237,\n", - " '大处': 73,\n", - " '业的': 5831,\n", - " '的工': 3826,\n", - " '作力': 112,\n", - " '度做': 43,\n", - " '做好': 2147,\n", - " '好化': 9,\n", - " '作同': 150,\n", - " '时做': 116,\n", - " '好压': 4,\n", - " '压缩': 279,\n", - " '缩管': 3,\n", - " '理层': 169,\n", - " '层级': 180,\n", - " '级减': 6,\n", - " '少法': 22,\n", - " '法人': 892,\n", - " '人单': 246,\n", - " '单位': 4512,\n", - " '位工': 151,\n", - " '作深': 108,\n", - " '深化': 2679,\n", - " '化体': 325,\n", - " '体制': 1551,\n", - " '制机': 392,\n", - " '机制': 4775,\n", - " '制改': 849,\n", - " '革努': 6,\n", - " '努力': 3855,\n", - " '力保': 224,\n", - " '保持': 3758,\n", - " '持好': 21,\n", - " '好稳': 34,\n", - " '稳中': 415,\n", - " '中向': 320,\n", - " '向好': 605,\n", - " '好态': 58,\n", - " '态势': 1057,\n", - " '势n': 749,\n", - " 'n她': 313,\n", - " '时表': 1025,\n", - " '国企': 2157,\n", - " '革将': 104,\n", - " '将提': 311,\n", - " '提速': 333,\n", - " '速进': 69,\n", - " '步落': 76,\n", - " '落实': 3869,\n", - " '实中': 374,\n", - " '业功': 19,\n", - " '功能': 3237,\n", - " '能界': 5,\n", - " '界定': 124,\n", - " '定与': 161,\n", - " '与分': 85,\n", - " '分类': 1462,\n", - " '类方': 19,\n", - " '方案': 3761,\n", - " '案及': 44,\n", - " '及配': 103,\n", - " '配套': 1169,\n", - " '套措': 31,\n", - " '措施': 3970,\n", - " '施深': 25,\n", - " '入推': 490,\n", - " '推进': 8791,\n", - " '进公': 107,\n", - " '司制': 108,\n", - " '制股': 9,\n", - " '股份': 2670,\n", - " '份制': 130,\n", - " '制和': 398,\n", - " '和混': 42,\n", - " '混合': 1305,\n", - " '合所': 194,\n", - " '有制': 289,\n", - " '革加': 92,\n", - " '快完': 142,\n", - " '完善': 3326,\n", - " '善创': 22,\n", - " '创新': 12627,\n", - " ...})" - ] - }, - "execution_count": 163, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "two_gram_counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, 
- "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Some More" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. 教学演示版本, 如果你想获得更好的结果,需要查阅更多资料,然后有很多小的点(stop words, smooth, OOV(out of vacabulary)); \n", - "2. 我们需要更多数据;\n", - "3. 数据也要保证高质量;" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n数学\\n\\n数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科,从某种角度看屬於形式科學的一種。數學透過抽象化和邏輯推理的使用,由計數、計算、量度和對物體形狀及運動的觀察而產生。數學家們拓展這些概念,為了公式化新的猜想以及從選定的公理及定義中建立起嚴謹推導出的定理。\\n\\n基礎數學的知識與運用總是個人與團體生活中不可或缺的一環。對數學基本概念的完善,早在古埃及、美索不達米亞及古印度內的古代數學文本便可觀見,而在古希臘那裡有更為嚴謹的處理。從那時開始,數學的發展便持續不斷地小幅進展,至16世紀的文藝復興時期,因为新的科學發現和數學革新兩者的交互,致使數學的加速发展,直至今日。数学并成为許多國家及地區的教育範疇中的一部分。\\n\\n今日,數學使用在不同的領域中,包括科學、工程、醫學和經濟學等。數學對這些領域的應用通常被稱為應用數學,有時亦會激起新的數學發現,並導致全新學科的發展,例如物理学的实质性发展中建立的某些理论激发数学家对于某些问题的不同角度的思考。數學家也研究純數學,就是數學本身的实质性內容,而不以任何實際應用為目標。雖然許多研究以純數學開始,但其过程中也發現許多應用之处。\\n\\n西方语言中“數學”()一詞源自於古希臘語的(),其有“學習”、“學問”、“科學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞(),意思為\"和學習有關的\"或\"用功的\",亦會被用來指\"數學的\"。其在英语中表面上的複數形式,及在法语中的表面複數形式\\',可溯至拉丁文的中性複數\\',由西塞罗譯自希臘文複數(),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。\\n\\n汉字表示的「數學」一詞大約产生于中国宋元時期。多指象數之學,但有時也含有今天上的數學意義,例如,秦九韶的《數學九章》(《永樂大典》記,即《數書九章》也被宋代周密所著的《癸辛雜識》記爲《數學大略》)、《數學通軌》(明代柯尚遷著)、《数学钥》(清代杜知耕著)、《數學拾遺》(清代丁取忠撰)。直到1939年,經過中國數學名詞審查委員會研究“算學”與“數學”兩詞的使用狀況後,確認以“數學”表示今天意義上的數學含義。\\n\\n數學有着久遠的歷史。它被認為起源於人類早期的生產活動:中國古代的六艺之一就有「數」,數學一詞在西方有希腊语詞源(mathematikós),意思是“学问的基础”,源于(máthema,“科学,知识,学问”)。\\n\\n史前的人類就已嘗試用自然的法則來衡量物質的多少、時間的長短等抽象的數量關係,比如时间单位有日、季節和年等。算術(加減乘除)也自然而然地產生了。古代的石碑及泥版亦證實了當時已有幾何的知識。\\n\\n更進一步則需要寫作或其他可記錄數字的系統,如符木或於印加帝國內用來儲存數據的奇普。歷史上曾有過許多不同的記數系統。\\n在最初有歷史記錄的時候,數學內的主要原理是為了做稅務和貿易等相關計算,為了解數字間的關係,為了測量土地,以及為了預測天文事件而形成的。这些需要可以简单地被概括为数学对數量、结构、空间及时间方面的研究。\\n\\n到了16世纪,算术、初等代数以及三角学等初等数学已大体完备。17世纪变量概念的产生使人们开始研究变化中的量与量的互相关系和图形间的互相变换,微积分的概念也在此時形成。随着數學轉向形式化,为研究数学基础而产生的集合论和数理逻辑等也开始发展。数学的重心从求解实际问题转变到对一般形式上的思考。\\n\\n從古至今,數學便一直不斷地延展,且與科學有豐富的相互作用,兩者的發展都受惠於彼此。在歷史上有著許多數學發現,並且直至今日都不斷地有新的發現。據
Mikhail B. Sevryuk於2006年1月的期刊中所說,「存放於數學評論資料庫中論文和書籍的數量自1940年(數學評論的創刊年份)現已超過了一百九十萬份,而且每年還增加超過七萬五千份。此一學海的絕大部份為新的數學定理及其證明。」\\n\\n每當有涉及數量、結構、空間及變化等方面的困難問題時,通常就需要用到數學工具去解決問題,而這往往也拓展了數學的研究範疇。一開始,數學的運用可見於貿易、土地測量及之後的天文學。今日,所有的科學都存在著值得數學家研究的問題,且數學本身亦給出了許多的問題。牛頓和莱布尼兹是微積分的發明者,費曼發明了費曼路徑積分,這是推理及物理洞察二者的產物,而今日的弦理論亦引申出新的數學。一些數學只和生成它的領域有關,且用來解答此領域的更多問題。但一般被一領域生成的數學在其他許多領域內也十分有用,且可以成為一般的數學概念。即使是「最純的」數學通常亦有實際的用途,此一非比尋常的事實,被1963年諾貝爾物理獎得主維格納稱為「數學在自然科學中不可想像的有效性」。\\n\\n如同大多數的研究領域,科學知識的爆發導致了數學的專業化。主要的分歧為純數學和應用數學。在應用數學內,又被分成兩大領域,並且變成了它們自身的學科——統計學和電腦科學。\\n\\n許多數學家談論數學的\"優美\",其內在的美學及美。「簡單」和「一般化」即為美的一種。另外亦包括巧妙的證明,如歐幾里得對存在無限多質數的證明;又或者是加快計算的數值方法,如快速傅立葉變換。高德菲·哈羅德·哈代在《一個數學家的自白》一书中表明他相信單單是美學上的意義,就已經足夠作為純數學研究的正當理由。\\n\\n我們現今所使用的大部分數學符號在16世紀後才被發明出來的。在此之前,數學以文字的形式書寫出來,這種形式會限制了數學的發展。現今的符號使得數學對於專家而言更容易掌握,但初學者卻常對此望而却步。它被極度的壓縮:少量的符號包含著大量的訊息。如同音樂符號一般,現今的數學符號有明確的語法,並且有效地對訊息作編碼,這是其他書寫方式難以做到的。符号化和形式化使得数学迅速发展,并帮助各个科学领域建立基础支撑理论。\\n\\n數學語言亦對初學者而言感到困難。如“或”和“只”這些字有著比日常用語更精確的意思。亦困惱著初學者的,如“開放”和“域”等字在數學裡有著特別的意思。數學術語亦包括如“同胚”及“可積性”等專有名詞。但使用這些特別符號和專有術語是有其原因的:數學需要比日常用語更多的精確性。數學家將此對語言及邏輯精確性的要求稱為「嚴謹」。但在现实应用中,舍弃一些严谨性往往会得到更好的结果。\\n\\n嚴謹是數學證明中很重要且基本的一部份。數學家希望他們的定理以系統化的推理依著公理被推論下去。這是為了避免依著不可靠的直觀而推出錯誤的「定理」,而這情形在歷史上曾出現過許多的例子。在數學中被期許的嚴謹程度因著時間而不同:希臘人期許著仔細的論證,但在牛頓的時代,所使用的方法則較不嚴謹。牛頓為了解決問題所做的定義,到了十九世紀才重新以小心的分析及正式的證明來處理。今日,數學家們則持續地在爭論電腦輔助證明的嚴謹度。當大量的計算難以被驗證時,其證明亦很難說是足夠地嚴謹。\\n\\n公理在傳統的思想中是「不證自明的真理」,但這種想法是有問題的。在形式上,公理只是一串符號,其只對可以由公理系統導出的公式之內容有意義。希爾伯特計劃即是想將所有的數學放在堅固的公理基礎上,但依據哥德爾不完備定理,每一相容且能蘊涵皮亞諾公理的公理系統必含有一不可決定的公式;因而所有數學的最終公理化是不可能的。儘管如此,數學常常被想像成只是某種公理化的集合論,在此意義下,所有數學敘述或證明都可以寫成集合論的公式。\\n\\n卡爾·弗里德里希·高斯稱數學為「科學的皇后」。在拉丁原文\\',以及其德語\\'中,對應於「科學」的單字的意思皆為知識(領域)。而實際上,science一詞在英語內本來就是這個意思,且無疑問地數學在此意義下確實是一門「科學」。將科學限定在自然科學則是在此之後的事。若認為科学是只指物理的世界時,則數學,或至少是純數學,不會是一門科學。愛因斯坦曾如此描述:「數學定律越和現實有關,它們越不確定;若它們越是確定的話,它們和現實越不會有關。」\\n\\n許多哲學家相信數學在經驗上不具可否證性,且因此不是卡爾·波普爾所定義的科学。但在1930年代時,在數理邏輯上的重大進展顯示數學不能歸併至邏輯內,且波普爾推斷「大部份的數學定律,如物理及生物學一樣,是假設演繹的:純數學因此變得更接近其假設為猜測的自然科學,比它現在看起來更接近。」然而,其他的思想家,如較著名的拉卡托斯,便提供了一個關於數學本身的可否證性版本。\\n\\n另一觀點則為某些科學領域(如理論物理)是其公理為嘗試著符合現實的數學。而事實上,理論物理學家齊曼(Jo
hn Ziman)即認為科學是一種公眾知識,因此亦包含著數學。在任何的情況下,數學和物理科學的許多領域都有著很多相同的地方,尤其是從假設所得的邏輯推論之探索。直覺和實驗在數學和科學的猜想建構上皆扮演著重要的角色。實驗數學在數學中的重要性正持續地在增加,且計算和模擬在科學及數學中所扮演的角色也越來越加重,減輕了數學不使用科學方法的缺點。在史蒂芬·沃爾夫勒姆2002年的著作《一種新科學》中他提出,計算數學應被視為其自身的一科學領域來探索。\\n\\n數學家對此的態度並不一致。一些研究應用數學的數學家覺得他們是科學家,而那些研究純數學的數學家則時常覺得他們是在一門較接近邏輯的領域內工作,且因此基本上是個哲學家。許多數學家認為稱他們的工作是一種科學,是低估了其美學方面的重要性,以及其做為七大博雅教育之一的歷史;另外亦有人認為若忽略其與科學之間的關聯,是假裝沒看到數學和其在科學與工程之間的交互影響,進而促進了數學在許多科學上的發展此一事實。這兩種觀點之間的差異在哲學上產生了數學是「被創造」(如藝術)或是「被發現」(如科學)的爭議。大学院系划分中常见「科学和数学系」,这指出了这两个领域被看作有緊密聯繫而非一樣。實際上,數學家通常會在大體上與科學家合作,但在細節上卻會分開。此爭議亦是數學哲學眾多議題的其中一個。\\n\\n如上所述,數學主要的學科最先產生於商業上計算的需要、了解數字間的關係、測量土地及預測天文事件。這四種需要大致地與數量、結構、空間及變化(即算術、代數、幾何及分析)等數學上廣泛的子領域相關連著。除了上述主要的關注之外,亦有用來探索由數學核心至其他領域上之間的連結的子領域:至邏輯、至集合論(基礎)、至不同科學的經驗上的數學(應用數學)、及較近代的至不確定性的嚴格研究。\\n為了闡明數學基礎,數學邏輯和集合論等領域被發展了出來。\\n\\n數學邏輯專注於將數學置在一堅固的公理架構上,並研究此一架構的結果。就數學邏輯本身而言,其為哥德爾第二不完備定理所屬的領域,而這或許是邏輯中最廣為流傳的成果-總存在一不能被證明而又為真的定理。\\n\\n現代邏輯被分成遞歸論、模型論和證明論,且和理論電腦科學有著密切的關連性,千禧年大獎難題中的P/NP問題就是理論電腦科學中的著名問題。\\n\\n數量的研究起於數,一開始為熟悉的自然數及整數與被描述在算術內的自然數及整數的算術運算。整數更深的性質於數論中有詳細的研究,此一理論包括了如費馬最後定理等著名的結果。數論還包括兩個被廣為探討的未解問題:孿生質數猜想及哥德巴赫猜想。\\n\\n當數系更進一步發展時,整數被視為有理數的子集,而有理數則包含於實數中,連續的量即是以實數來表示的。實數則可以被進一步廣義化成複數。數的進一步廣義化可以持續至包含四元數及八元數。從自然數亦可以推廣到超限數,它形式化了計數至無限的這一概念。另一個研究的領域為大小,這個導致了基數和之後對無限的另外一種概念:阿列夫数,它允許無限集合之間的大小可以做有意義的比較。\\n\\n許多如數及函數的集合等數學物件都有著內含的結構。這些物件的結構性質被探討於群、環、-{zh-cn:域;zh-tw:體}-等抽象系統中,該些物件事實上也就是這樣的系統。此為代數的領域。在此有一個很重要的概念,即廣義化至向量空間的向量,它於線性代數中被研究。向量的研究結合了數學的三個基本領域:數量、結構及空間。向量分析則將其擴展至第四個基本的領域內,即變化。\\n\\n创立于二十世纪三十年代的法国的布尔巴基学派认为:纯粹数学,是研究抽象结构的理论。\\n结构,就是以初始概念和公理出发的演绎系统。\\n布尔巴基学派认为,有三种基本的抽象结构:代数结构(群,环,域……),序结构(偏序,全序……),拓扑结构(邻域,极限,连通性,维数……)。\\n\\n空間的研究源自於幾何-尤其是欧几里得几何。三角學則結合了空間及數,且包含有著名的勾股定理。現今對空間的研究更推廣到了更高維的幾何、非歐幾里得幾何(其在廣義相對論中扮演著核心的角色)及拓撲學。數和空間在解析幾何、微分幾何和代數幾何中都有著很重要的角色。在微分幾何中有著纖維叢及流形上的微積分等概念。在代數幾何中有著如多項式方程的解集等幾何物件的描述,結合了數和空間的概念;亦有著拓撲群的研究,結合了結構與空間。李群被用來研究空間、結構及變化。在其許多分支中,拓撲學可能是二十世紀數學中有著最大進展的領域,並包含有存在已久的龐加萊猜想,以及有爭議的四色定理。龐加萊猜想已在2006年确认由俄罗斯数学家格里戈里·佩雷尔曼證明,而四色定理已在1976年由凱尼斯·阿佩爾和沃夫岡·哈肯用電腦證明,而從來沒有由人力來驗證過。\\n\\n了解及描述變化在自然科學裡是一普遍的議題,而微積分更為研究變化的有利工具。函數诞生於此,做為描述一變化的量的
核心概念。對於實數及實變函數的嚴格研究為實分析,而複分析則為複數的等價領域。黎曼猜想-數學最基本的未決問題之一-便是以複分析來描述的。泛函分析注重在函數的(一般為無限維)空間上。泛函分析的眾多應用之一為量子力學。許多的問題很自然地會導出一個量與其變化率之間的關係,而這在微分方程中被研究。在自然界中的許多現象可以被動力系統所描述;混沌理論則是對系統的既不可預測而又是決定的行為作明確的描述。\\n離散數學是指對理論電腦科學最有用處的數學領域之總稱,這包含有可計算理論、計算複雜性理論及資訊理論。可計算理論檢驗電腦的不同理論模型之極限,這包含現知最有力的模型-圖靈機。複雜性理論研究可以由電腦做為較易處理的程度;有些問題即使理論是可以以電腦解出來,但卻因為會花費太多的時間或空間而使得其解答仍然不為實際上可行的,儘管電腦硬體的快速進步。最後,資訊理論專注在可以儲存在特定媒介內的資料總量,且因此有壓縮及熵等概念。\\n\\n作為一相對較新的領域,離散數學有許多基本的未解問題。其中最有名的為P/NP問題-千禧年大獎難題之一。一般相信此問題的解答是否定的。\\n\\n應用數學思考將抽象的數學工具運用在解答科學、工商業及其他領域上之現實問題。應用數學中的一重要領域為統計學,它利用機率論為其工具並允許對含有機會成分的現象進行描述、分析與預測。大部份的實驗、調查及觀察研究需要統計對其資料的分析。(許多的統計學家並不認為他們是數學家,而比較覺得是合作團體的一份子。)數值分析研究有什麼計算方法,可以有效地解決那些人力所限而算不出的數學問題;它亦包含了對計算中捨入誤差或其他來源的誤差之研究。\\n\\n數學獎通常和其他科學的獎項分開。數學上最有名的獎為菲爾茲獎,創立於1936年,每四年頒獎一次。它通常被認為是數學的諾貝爾獎。另一個國際上主要的獎項為阿貝爾獎,創立於2003年。兩者都頒獎於特定的工作主題,包括數學新領域的創新或已成熟領域中未解決問題的解答。著名的23個問題,稱為希爾伯特的23個問題,於1900年由德國數學家大衛·希爾伯特所提出。這一連串的問題在數學家之間有著極高的名望,且至少有九個問題已經被解答了出來。另一新的七個重要問題,稱為千禧年大獎難題,發表於2000年。對其每一個問題的解答都有著一百萬美元的獎金,而當中只有一個問題(黎曼猜想)和希爾伯特的問題重複。\\n\\n\\n\\n\\n\\n\\n\\n哲学\\n\\n哲學()是研究普遍的、根本的问题的学科,包括存在、知识、价值、理智、心灵、语言等领域。哲学与其他学科的不同是其批判的方式、通常是系统化的方法,并以理性论证為基礎。在日常用语中,其也可被引申为个人或团体的最基本信仰、概念或态度。\\n\\n英語詞語()源于古希臘語中的,意思為「愛智慧」,有时也译为「智慧的朋友」,该词由(philos,爱)的派生词(Philein,去爱)和(Sophia,智慧)组合而成。一般认为,古希腊思想家毕达哥拉斯最先在著作中引入“哲学家”和“哲学”这两个术语。\\n\\n“哲”一词在中国起源很早,如“孔门十哲”,“古圣先哲”等词,“哲”或“哲人”,专指那些善于思辨,学问精深者,即西方近世“哲学家”,“思想家”之谓。在《易經》當中已經開始討論哲學問題,形而上学的中文名稱取自《易經·繫辭上傳》「形而上者谓之道,形而下者谓之器」一語。1874年,日本啟蒙家西周,在《百一新論》中首先用漢文「哲學」來翻譯\"philosophy\"一詞。\\n\\n英国哲学家罗素对哲学的定义是:\\n\\n胡適在《中国哲学史大纲》中称「凡研究人生切要的问题,从根本上着想,要寻一个根本的解决:这种学问叫做哲学」。\\n\\n雖然哲學源自西方的傳統,但許多文明在歷史上都存在著一些相似的論題。東亞和南亞的哲學被稱之為東方哲學,而北非和中東則因為其和歐洲密切的互動,因此常被視為是西方哲學的一部份。\\n\\n對哲學的主題亦存在許多看法。一些人認為哲學是對問題本身過程的審查;另外一些人則認為實質上存在著哲學必須去回答的哲學命題。\\n\\n\\n古希臘哲學家透過問問題來進行哲學實踐,他們所提的問題大概可以歸類為三類,這三類問題分別形成了哲學的基礎學科——分别是形而上学、伦理学、认识论(或知识论) 
。\\n\\n有意思的是,现代哲学上蒙现出\"不要求精确理由\"的哲学论调,如\"本质技巧\"(认定本质不可知),这种现象将不可知论(世界上终究有人不能理解的存在)的重要性提高了。\\n\\n哲學可以分为很多不同的分支,主要包括形而上學、知識論、倫理學、邏輯學和美學。\\n\\n\\n很多人类社群思考过哲学问题并且互相学习建立了各种哲学流派。\\n\\n东方哲学是通过每个地区的历史时期来组织的。西方哲学一般可以分为三个或更多时期,最重要的是古典哲学、中世纪哲学和近代哲学。\\n\\n印度哲學的歷史源遠流長,早在吠陀時代已經開始,至公元前6世紀為全盛時期。當時古印度的思想界百花齊放,其中最著名的包括佛教創始人釋迦牟尼佛、耆那教創始人笩駄摩那、阿耆多·翅舍欽婆羅、波拘陀·迦旃延、富蘭那·迦葉、數論派等。\\n\\n中國哲學的主要部分起源東周時期,当时以诸子百家广为人知,以孔子的儒家、老子的道家、墨子的墨家及晚期的法家為代表,还有一些流派例如农家、阴阳家和名家在之后则名声不显。在秦朝焚书坑儒后除了法家、儒家、道家外其他流派都不再活跃。在當代,中國哲學仍然在亞洲文化扮演一定作用,但是學理上仍在爭辯中國哲學是否應歸為哲學。\\n\\n古希腊-哲学是西方哲学的一个时期,时间为公元前6世纪[约585]到公元6世纪。它一般被分为三个时期:前苏格拉底时期、柏拉图和亚里士多德的古典希腊时期、和后亚里士多德(或希腊化)时期:有时候会把新柏拉图主义和基督教哲学家们的古典时代晚期加入作为第四个时期。\\n\\n在公元前6世纪的希腊,西方哲学就从古代神话和诗歌中脱颖而出,逐步开始对宇宙的组成以及本源的思考而开始了独立发展。前苏格拉底时期的自然派哲學家们多关注自然界,被認為是西方最早的哲學家,不管他們認識以及解释世界的方式是否正確,但是他們的想法之所以有別於迷信的原因在於,這些哲學家是以理性輔佐證據的方式歸納出自然界的现象。诸如:\\n\\n\\n公元前5世纪中期,普罗泰戈拉和高尔吉亚等所形成的辯士學派将研究的重点由自然转移到人类本身。认为“人才是万物之本”。他们都不相信有真正的存在和真理。普罗泰戈拉认为是非善恶都是相对于人的感觉而言,而高尔吉亚却认为所有的都是同样的假,这是怀疑论的雏形。 \\n\\n公元前6世纪末,以毕达哥拉斯为主的毕达哥拉斯学派所主张的哲学与前述的观点既相近又有不同。罗马古代的历史上记载毕达哥拉斯第一个称自己为哲学家,或者说是爱智慧。他认为“一切都是数字”。其意思就是说一切事物的实质和结构都是它们所包含的数字关系所决定的。他称平均、秩序和调和是宇宙的三大基调,并以音乐的调和说明宇宙的调和。他所在的学派将宇宙总结为十种性质相异的组合:有限与无限、奇与偶、一与多、左与右、男与女、静与动、直与曲、光明与黑暗、善与恶、方与圆。至此之后,数学的本质及其地位,一直都是哲学的主要问题之一,数学不受观察和实验造成的不确定性影响,而且是通过纯粹的思想加以理解的。\\n\\n其中关于变与不变的关系的争论,真实世界与直觉世界的差别,真理与意见的矛盾,导致产生了认识论的问题。\\n\\n在古典希腊时期西方哲学方法的关键特质被建立:依靠诉诸理性和论证,通过一种批判性的方法来接受或建立观点。这包括苏格拉底被称为蘇格拉底反詰法或“反驳论证”方法的辩证法,他主要用其来检验例如善良和公平正義的关键道德概念。这种方法将一个问题分解成一系列的疑问,在对疑问的回答中逐步提取想要找到的答案,其极大影响可以从现在使用的科学方法中看出,在科学方法中假说是第一个阶段。\\n\\n苏格拉底没有直接教过人,但之后的柏拉图深受其影响。而其整个哲学思想来源于两大理论:其一,永远不要做坏事;其二,一个内心真正善良且正义的人绝不会做相反之事。他认为真理有其客观性,试图推翻智者们以个人主观感觉为真理的思想。然后提出德的概念,以作为人生行事的方向。对于道德是什么的问题,苏格拉底的回复为“知识即道德。”对于知识是何物的问题,他回答说知识是透过理性而得的概念。苏格拉底开创了认识论和伦理学,如此奠定了他的哲学地位。\\n\\n古典希臘時期的的哲学家中柏拉图和亚里士多德对后世的影响力最大,特别是柏拉图被认为是西方哲学的创始人。哲学家阿爾弗雷德·諾思·懷特黑德评价柏拉图:“欧洲哲学传统最被普遍公认的特点,就是它包含了一系列对柏拉图的注脚。我的意思不是怀疑学者们系统体系的思想是提取自柏拉图的著作。我暗示的是那些他们散落的一般思想的财富。”換言之即使數千年後,人們依舊在試著回答他所提出的問題,這也代表著人們依然為這些問題或是這些問題所延伸的更多問題而感到困惑。\\n\\n毕达哥拉斯的思想对柏拉图产生了显著地影响,并通过柏拉图影响了整个西方哲学。柏拉图和亚里士多德作为最早的古典希腊哲学家批判地引用了其它的一些”智
者“,当时这些人在希腊被称为“辩士”并在毕达哥拉斯之前相当普遍。从他们的批判看来,在他们的古典时代一个在更高尚地、纯粹地”爱智慧”(真的哲学家)与那些更早更普遍的旅行教师——经常也通过自己的技艺来赚钱——之间的分水岭之后被建立。\\n\\n亚里士多德死后,整个哲学界陷入了独立时期,称为时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学,并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。\\n\\n直到公元529年,罗马皇帝查士丁尼一世尼命令关闭雅典的柏拉图学院。称一些余下的学院成员逃入了萨珊王朝首都泰西封。\\n\\n印度哲學是指起源於印度次大陸的哲學思想,包括、、等,這些印度哲學具有一些共同且複雜的起源,都有有關佛法及業的主題,而且都希望達到個人的解放。這些哲學約在西元前一世紀到西元幾世紀的時間成形。\\n\\n中世纪哲学指的是西欧和中东在中世纪的哲学体系,其时间范围没有定论,大致上是从基督化的罗马帝国时期至文艺复兴时期。中世纪哲学被部分定义为对古典希腊和希腊化哲学的再发现和进一步发展,另一部分是需要解决神学问题并把亞伯拉罕諸教(伊斯兰教、犹太教和基督教)的教条同世俗知识一同整合并推广。\\n\\n文艺复兴人文学者们排斥中世纪时期,把它当作在希腊罗马的古典时代与古典文化“复兴”之间的一个“过渡”的野蛮时期。然而在中世纪这将近一千年中哲学在欧洲仍取得了长足地发展。认为\"在强度、复杂度还有成就上,可以确信地说哲学在十三世纪的兴盛能与公元前四世纪古希腊哲学的黄金时期媲美。\"\\n\\n这个时代讨论的问题有信仰和理智的关系,神的存在与统一,神学话题和形而上学,关于知识、宇宙和个人的问题。\\n\\n中世纪的哲学家包括基督教学者希波的奥古斯丁、波爱修斯、安瑟倫、、皮埃尔·阿伯拉尔、罗吉尔·培根、圣文德、托马斯·阿奎那、邓斯·司各脱、奥卡姆的威廉和让·布里丹等;犹太哲学家迈蒙尼德和;还有穆斯林哲学家肯迪、法拉比、海什木、伊本·西那、安薩里、伊本·巴哲、、伊本·赫勒敦和伊本·魯世德等。中世纪的经院哲学传统一直到17世纪仍在活跃,例如和等人物。其中托马斯主义之父阿奎那极大地影响了整个天主教欧洲,他特别强调理性和论证,是最先开始使用亚里士多德形而上学和知识论的著作的新译本的学者之一。他的工作明显远离了统治大部分早期经院哲学的新柏拉图主义和奥古斯丁的思想。\\n\\n从文艺复兴开始,人们的思想开始从清净的僧院走出,来到喧嚣的尘世。从而发展自然,也发展人类自身。从而形成人文主义和自然哲学两股既有联系又有区别的思潮。\\n\\n\"文艺复兴\"是对中世纪到近代之间过渡时期的通称,那时对古典文献的重新学习帮助把哲学界的兴趣从对逻辑学、形而上学和神学领域的钻研转移到包括道德、语言学和神秘主义的更加广泛的研究。对经典和人文艺术例如历史学和文学的研究在基督教世界学术界中享有前所未有的兴趣,这个趋势被称为人文主义,它受到柏拉图主义、希腊怀疑主义和罗马斯多葛主义的影响。人文主义者的哲学兴趣跟随彼特拉克转移到造物主与其美德上,替代了中世纪时对形而上学和逻辑学的兴趣。\\n\\n那时对古典哲学的研究出现了两种新方式。一方面对亚里士多德的研究因为的影响而产生了变化。阿威罗伊亚里士多德主义者和更正统的天主教亚里士多德主义者譬如艾爾伯圖斯·麥格努斯和托马斯·阿奎那之间的分歧最终在文艺复兴发展出一种“人文亚里斯多德哲学”,譬如和的思想。另一方面,在一些之前不为西欧所熟知的作品的重发现的帮助下,对柏拉图和新柏拉图主义的研究作为另一个选择变得普遍起来。著名的文艺复兴时期的柏拉图主义者包括库萨的尼古拉,还有之后的马尔西利奥·费奇诺和若望·皮科·德拉·米蘭多拉。\\n\\n文艺复兴也重新产生了对反亚里士多德的把自然看作一个有机的、活生生的整体而不取决于神学的理论的兴趣,例如在库萨的尼古拉、尼古拉·哥白尼、焦爾達諾·布魯諾、还有托马索·康帕内拉的著作中。在自然哲学中这样的运动与对神秘主义、魔法、赫尔墨斯主义还有占星学等兴趣重燃相契合,它们被认为隐藏着收获知识和掌控自然方法的大门。\\n\\n这些新的哲学运动伴随着欧洲宗教和政治的剧变同时出现:宗教改革和封建制的衰落。虽然参与宗教改革的神学家们对哲学没有直接的兴趣,他们打破了神学和知识权威的传统基础。同时还伴随着信仰主义和怀疑主义的复兴,体现在伊拉斯谟,蒙泰涅和等思想家身上。同时,民族国家政治上逐步的中央集权的过程得到了世俗政治哲学的响应,如尼可罗·马基亚维利(常被描述为第一个现代政治思想家,或者是现代政治
思想形成的关键点)、托马斯·莫尔、伊拉斯谟、尤斯图斯·利普修斯、让·博丹和胡果·格老秀斯等的著作。\\n\\n先秦諸子之后的兩漢經學、魏晉玄學等都是中國哲學的一部份,自唐朝起佛教也开始对哲学产生重要影响;不过中世纪中国哲学最主要的部分是宋明理學的发展。\\n\\n宋明理学反对汉代后开始影响儒学的道教和佛教中的迷信和神秘的元素,是一股倡导更加理性和世俗化儒学的哲学运动。尽管理学遭到道教和佛教徒的批评,理学仍借鉴了它们两个的部分术语和概念。然而和佛教和道教把形而上学看作心灵发展、宗教启示的催化剂并且是不朽的不同,宋明理学把形而上学当作建立一个理性的伦理体系的指导。宋明理学的起源可以追溯到唐朝:韩愈和李翱被视为宋代理学的先驱。宋代理学家周敦颐以道教形而上学理论为框架建立了他的伦理哲学体系,他被看作是宋明理学的创始人。\\n\\n在东亚的其他地方,日本哲学形成于本土的神道信仰和佛教、儒家以及另一些中国哲学和印度哲学学派混合发展。与日本类似,在中巫教的情绪化内容被混合到了从中国传入的理学当中。\\n\\n主條目:近代哲學\\n\\n西方哲学史上的近代早期一般指17世纪和18世纪,其中18世纪常被称为启蒙时代。现代哲学不同于其前身,它和传统权威例如教会、学院、亚里士多德的关系更加独立,出现了对知识基础和形而上学体系建设的新兴趣;和摆脱了自然哲学的近代物理学的出现。从17世纪开始,近代哲学就以认识论为研究重点。由于经验论(经验主义)与唯理论(理性主义)的争论,使物质与精神的关系作为认识论的首要问题突显出来。\\n当时其他的哲学焦点包括精神的天性和其与身体的关系,新的自然科学对诸如自由意志和神的传统上属于神学的话题的影响,和伦理学和政治哲学的世俗基础\\n。这种潮流最早被鲜明地体现在弗兰西斯·培根的被称为用来扩展知识的新的、经验主义的程序,并很快在笛卡儿的机械主义物理学和理性主义的形而上学中建立了具有巨大影响力的形式。培根运用归纳法,第一个提出思维的主体“人”应该主动干涉自然来为人服务。\\n\\n近现代政治哲学的鼻祖托马斯·霍布斯最早将这套方法论系统得应用在政治哲学上,包括\"社會契約\"的近代理论。早期近代哲学的学术经典一般包括笛卡尔、斯賓諾莎、莱布尼茨、洛克、贝克莱、休谟和康德。同时其的其他思想家也对哲学做出了贡献,例如伽利略、皮埃尔·伽桑狄、布莱兹·帕斯卡、马勒伯朗士、艾萨克·牛顿、、孟德斯鸠、、托马斯·里德、让·勒朗·达朗贝尔和亚当·斯密,而让-雅克·卢梭是反启蒙运动的开创性人物。早期近代哲学的大致结束通常被确定为伊曼努尔·康德的试图限定形而上学范围、证明科学知识并用道德和自由来调和两者的体系。\\n\\n理性主义者中勒内·笛卡儿认为物质世界是由数学关系组成的单一体系,他企图将物理学转化为数学。他在其著作中,对整个经院哲学以及在他那个时代流行的教育与哲学体系加以讽刺。其认为“我思故我在”是认识论的无可怀疑之出发点。笛卡尔是割裂精神和物质的二元论者,为了厘清二者关系,他坚定认为在上帝那里,精神和物质是统一的。其理论被称为笛卡尔主义\\n。斯宾诺莎是笛卡尔之后,又一位著名的唯理论者。他的认识论、几何学和机械观都来自于笛卡尔。但他不认同笛卡尔的二元论,认为精神和物质不过是唯一实体的两种属性\\n。莱布尼茨作为唯理论者坚定地维护笛卡尔的学说并反驳约翰·洛克的理论。与笛卡尔不同的是,他认为万物的实体是“单子”,且互相没有关系,而是由于“前定和谐”才共存一体,即存在于神之中。“前定和谐”调和了笛卡尔之二元论和斯宾诺莎之实体双重性。\\n洛克发展了经验论,他不认同笛卡尔的“天赋观念”,提出白板说,他强调人们从感觉中抽象出普遍的概念,认为感觉中的个别东西才是第一位的。不过他基本认同笛卡尔的二元论。贝克莱发展了洛克的哲学理论,提出了“存在就是被感知”。他认为除了感知的主题和被感知的知觉之外,什么也没有。他非常不赞同物质的抽象概念,认为其既无客观实在,也不能存在于人心。大卫·休谟的理论比贝克莱的更进一步,他不仅仅认为物质实体不存在,更认为精神实体不存在。只承认知觉的存在。他还以自己的不可知论和怀疑论认为不存在统一性和普遍性的东西,认定多样性和个别性才是最高原理。\\n\\n经验论与唯理论的争论也包含了唯物主义与唯心主义的争论。在18世纪时,法国的拉美特利公开宣布唯物主义是唯一的,而百科全书的主编德尼·狄德罗也拒绝承认神的存在。\\n\\n另外伏尔泰,孟德斯鸠和其他百科全书派的学者都有涉及政治和伦理领域。他们都认为机械主义才是最终形式——物质是唯一的且处于永恒运动的,精神只是人脑的属性。因此他们认为无机物与有机物不可逾越,人的思维是人感官的结果。不过他们仍然是经验主义者,在因果性上,他们认为只有必然性才是唯一的,这就成为唯心主义的观念。\\n\\n从18
世纪中后期开始,直到19世纪初,哲学便进入了近代哲学的总结时期,这就是德国古典哲学时期。有两条线索标志着转折的到来:一、思维与存在的关系更加明确;二、产生了系统辩证法。其代表人物有I.康德、J.G.费希特、F.W.谢林、G.W.F.黑格尔等。\\n\\n康德给哲学带来了三个标志性的创造:\\n\\n\\n他受到休谟的诸多影响,并为西方哲学带来一次革命。他认为哲学的研究核心就是规定理性能做什么以及不能做什么。\\n\\n康德同意休谟的理论并认为,存在一些原则,使得心灵对经验和认识加以组织,而证据皆可以在数学中找到。即是,包含在命题里的要比包含在原是概念的定义要多得多。他使用称之为批判哲学的先验方法,来展现经验的某些范畴和形式都必然地被预先存在于人们一切言谈之中。\\n\\n凭借着他的三部“批判性”的著作,为先验方法作出相应的结构:\\n\\n\\n他还为道德哲学奠定了新基础,且他赋予了自由概念的新意义。因为其影响在现代依旧尚存,其理论被人们称为康德主义。\\n费希特本来承认斯宾诺莎的机械的因果决定论,但后来受到康德的影响,开始认为因果决定论只是表面,其实质为自我不是必然性的奴仆而是独立自由的主体。就此,他建立了主观的思维与客观的存在之统一说。\\n\\n谢林是从费希特理论出发的,但深受斯宾诺莎和文学上浪漫主义的影响,创立了自己的学说。即他认为自然和精神、存在和思维,客体和主体,表面相反,实则统一,是同一个“绝对”的不同发展阶段,这个“绝对”即是万事万物的根源。他认为艺术才是最直观的理性。\\n黑格尔及其理论的出现将西方哲学的推上一个新高度,他创立了西方哲学史上最庞大的客观唯心主义体系,并系统地阐述了辩证法。他的理论和学说对近现代哲学产生了很深远的影响,并被称为黑格尔主义。\\n\\n从黑格尔的思想体系中发展而成的多种哲学运动。其重点就是以历史和逻辑为主,历史方面,它从不同角度理解“凡是合理的就是现实的”;逻辑方面,它有发现其中所说的“真理即整体”。\\n\\n黑格尔认为哲学的重点是放弃分裂,达到统一。他把以前的时代说成是思维与存在、理想与现实分裂,自由与必然,个人与社会、无限与有限、统一性与多样性分裂之时代。\\n\\n他从康德的“心灵的合理性以及在经验中的积极作用”的概念出发,但反对康德的“超越经验世界和‘物自身’的世界”,并认为心灵和世界一样具有相同基础理性结构。他所认为的普遍性不是脱离特殊的抽象普遍,而是包含特殊在内之普遍,即为具体普遍;他所认为的统一也非脱离矛盾、对立的抽象统一,而是包含它们在内的统一,即为对立统一。上述综合在一起即是他的理论:最真实的无所不包的整体即是“绝对精神”,又是对立的统一。\\n\\n他认为,为了达到这个“绝对精神”,需要经过三个阶段,从逻辑、自然到精神,即是从思维到存在,再到两者统一的过程,从而完成他的统一论。\\n\\n就此,社会和历史的现象,便被赋予一种在哲学史上还是崭新的显赫地位。他还将伦理学划归到这个领域,从而在伦理学理论和对思想的理解中提出重要的路线。\\n\\n从19世纪中叶开始,西方哲学就进入现代哲学阶段。因为在19世纪中期,欧洲的工业革命几近完成。\\n\\n现代哲学,特别是19世纪中后期的哲学流派,有叔本华的意志主义,新康德主义,新黑格尔主义,马克思主义。然而此時的哲學與後來的存在主義、現象學等在當代一般歸為「歐陸哲學」,與二十世紀以後著重嚴謹邏輯與語詞分析所發展出的「分析哲學」成為風格迥異的兩大西方哲學典範。\\n\\n20世纪的西方哲学上主流有两条:\\n\\n现代哲学主要包含以下几种潮流。\\n历程哲学:\\n\\n主流马克思主义:\\n\\n西方马克思主义:\\n\\n革新的黑格尔主义:\\n\\n结构主义:\\n\\n分析哲学:\\n\\n实证主义:\\n\\n新康德主义:\\n\\n逻辑实证主义:\\n\\n语言哲学:\\n\\n现象学:\\n\\n唯物论:\\n\\n新托马斯主义:\\n\\n科學哲學:\\n\\n意志主义:\\n\\n实用主义:\\n\\n存在主义:\\n\\n解释学:\\n\\n唯心主義的各种变体在18世紀晚期至20世紀早期的哲学界相当流行。康德主张的先验唯心主义认为人们对事物的理解是有界限的,因为在客观判断条件下很多事情是办不到的。他在1781年發行的作品《纯粹理性批判》試圖調和18世紀兩大主要的哲學派別:經驗主義和理性主義,并且建立一个研究形而上学的新基础。\\n\\n德國唯心主義最著名的作品是黑格尔于1807年出版的《精神现象学》。黑格尔承认自己的理念不是新的,不过他的目标是完成之前的哲学家们的不完整的体系。黑格尔认为哲学的重点是放弃分裂,达到统一。他把以前的时代说成是思维与存在、理想与现实分裂,自由与必然、个人与社会、无限与有限、统一
性与多样性分裂之时代。他从康德的“心灵的合理性以及在经验中的积极作用”的概念出发,但反对康德的“超越经验世界和‘物自身’的世界”,并认为心灵和世界一样具有相同基础理性结构。他所认为的普遍性不是脱离特殊的抽象普遍,而是包含特殊在内之普遍,即为具体普遍;他所认为的统一也非脱离矛盾、对立的抽象统一,而是包含它们在内的统一,即为对立统一。上述综合在一起即是他的理论:最真实的无所不包的整体即是“绝对精神”,又是对立的统一。黑格尔认为需要经过三个阶段来达到这个“绝对精神”,从逻辑、自然到精神,即是从思维到存在,再到两者统一的过程,从而完成他的统一论。他还将伦理学划归到这个领域,从而在伦理学理论和对思想的理解中提出重要的路线。\\n马克思主义哲学是马克思和恩格斯建立的以辩证唯物主义为核心的哲学体系。其认为实践是检验哲学之真理性的最终标准,哲学应伴随着社会、科学技术和文化的发展而不断发展。其主要思想体系在19世纪70年代主要由恩格斯创立,20世纪20年代在苏联形成完整体系——辩证唯物主义和历史唯物主义,这个体系在后来的社会主义国家推动下得以发展。马克思主义哲学宣称自己的理论体系具有科学性,认为哲学可以成为科学的一部分。同时马克思主义哲学认为哲学还具有意识形态的性质。\\n\\n另外马克思主义在政治上也指各种不同的共产主义运动,如由列宁所创立而被斯大林修改的苏联马克思主义,称为马克思列宁主义,为俄国革命以及后来建立的各种共产党之教义。它的旁系包括反斯大林的托洛茨基及其追随者的马克思主义、毛泽东的马克思列宁主义等。\\n實用主義產生於19世紀70年代的現代哲學派別,在20世紀的美國成為一種主流思潮。對法律、政治、教育、社會、宗教和藝術的研究產生了很大的影響。實用主義也試圖在理性主義及經驗主義找出一條中間道路來,是「經驗主義思想方法與人類的比較具有宗教性需要的適當的調和者。」\\n\\n現象學是由德國哲學家胡塞爾在1900年提出的理論,強調對直接直觀和經驗感知的區分,認為哲學(或至少是現象學)的主要任務是釐清二者之間的關聯,並且在直觀中獲得對本質的認識。现象学是对经验结构与意识结构的哲学性研究。作为一个哲学运动,现象学于二十世纪早期由埃德蒙德·胡塞尔创立,之后被他在德国的哥廷根大学和慕尼黑大学中的一派追随者发展壮大。在此之后现象学传播到法国、美国以及其他地区,并远超出了胡塞尔早期著作的语境。 其他主要哲學家包括海德格(Martin Heidegger), 梅洛—龐蒂(Maurice Merleau-Ponty), 以及列維納斯(Emmanuel 
Lévinas)。\\n\\n存在主义是一个哲学的非理性主义思潮,该术语被用在十九世纪晚期到二十世纪的一些哲学家的工作上,尽管他们的学说相差巨大,但他们都相信哲学思考开始于人类主体——而不仅仅是思维主体,而且包括行为、感知、人类个体。存在主义强调个人、獨立自主和主观经验,認為人存在的意義是無法經由理性思考而得到答案。在存在主义中,个体的出发点的特征是被称为“存在的态度”,或一种面对显然是一个无意义的或荒谬的世界的迷失和混乱的感觉。很多存在主义者还认为传统的体系和哲学学术无论是内容和风格都过于抽象并远离人类经验。\\n\\n19世纪哲学家克尔凯郭尔和尼采被看作存在主义的先驱,尽管他们没有使用这个术语。然而他们的影响延伸出了存在主义思想。克尔凯郭尔著作主要针对的是黑格尔的唯心主义哲学体系,他认为其忽视或排除了人类的内在主观生命。相反克尔凯郭尔认为\"真理是主观的\",主张对一个现实的人类来说最重要的问题是处理个人与存在内在关系的问题。克尔凯郭尔作为一个基督徒相信宗教信仰的真相是一个主观问题,而且人应该用热情去深思这个问题。\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n文學\\n\\n文學(),在最廣泛的意義上,是任何單一的書面作品。更嚴格地說,文學寫作被認為是一種藝術形式,或被認為具有藝術或智力價值的任何單一作品,通常是由於以不同於普通用途的方式部署語言。它的拉丁詞根\"literatura\"/\"litteratura\"(本身起源於\"littera\":\"letter\"或\"handwriting\")被用來指代所有的書面記錄,儘管當代定義將術語擴展到包括口頭或唱歌的文本(口頭文學)。文學可以根據是虛構作品還是非虛構作品進行分類,也可以根據是韻文還是散文進行分類;可以根據長篇小說、中篇小說、短篇小說等主要形式進一步區分;作品往往根據歷史時期或者遵守某些美學特徵或期望(藝術類型)進行分類。以语言文字为工具形象化地反映現實的藝術,包括韻文、散文、劇本、小说等,是文化的重要表现形式,以不同的流派表现内心情感和再现一定时期和一定地域的生活。\\n\\n這個概念隨著時間的推移而改變了意義:現在它可以擴大到非書面的口頭藝術形式,可以與語言或文字本身配合,因此很難就其起源達成一致。印刷技術的發展使得書面作品的分佈和擴散成為可能,最終導致了網絡文學。\\n\\n文學並不一定是客觀的,一名成功的文學家能在自己的文學作品中,展現自己對於文學的主觀看法,抒發自己的情緒和感觸,但藉由嘗試建立一個「客觀的標準」,有時對能幫助作家了解「讀者的感受」以求將內心之情感與藝術表現完整的體現在讀者心中。有時也能藉作家主觀想法帶給社會不同面相去省思現況,例如女性文學的興起。\\n\\n文學的歷史和文明發展有密切的關係。若將文學定義為用文字記錄的作品,最早的古代文學作品一般認為是古埃及文學及。古埃及文學中主要的文類(讚美詩、祈禱文及故事)幾乎都是以詩的方式寫成的,不過雖然可以清楚看出有使用詩歌技巧(poetic devices),但詩歌的韻律不明。最早已知的文學作品是公元前2700年一篇由蘇美人创作的《吉爾伽美什史詩》,當中描述英雄主義、友誼、損失及追逐永生。\\n\\n不同的歷史時期有著不同特色的文學。古代的文學中有許多有關世界起源及習俗起源內容,也有一些其中有道德及靈性意涵的神話。铁器时代的荷马史詩及以較晚一些的有較多有關作者的資訊,而許多的神話則是用口頭傳播的方式流傳下來。\\n\\n各種文學都可以視為是文字的紀錄,文學本身可能是寫實或是虛構,但都可以描繪出一些事實,例如主角的動作及言語、作者的寫作風格,以及文字後的含義等。這些情節不只是娛樂性的,其中也包括了經濟、心理、科學、宗教、政治、文化及社學的相關資訊。在學習歷史時,研究及分析當時的文學也是重要的一部份。研究過去的文學可以看到不同歷史時期時,其社會和社會規範的演變,甚至於也可助於瞭解現今的文學,因為其中常常引用古希臘神話、宗教典藉及相關文獻的資料。人們不止可以從各主題相關的文學中看到該主題隨著歷史的演進(例如從經濟史的書或介紹科學及演化的書),甚至連科幻小說中都可看到類似的內容。作者常常在其作品中加入一些歷史的內容,例如拜倫勳爵在《Childe Harold’s Pilgrimage: Canto I》中藉由主角Childe 
Harold提到西班牙文及法文,也提到作者的一些想法。藉由文學人們可以繼續的發現有關歷史的新資訊,這個從各個學科領域都有和文學相關的子領域可以看出。當人們將資訊用文字的方式紀錄下來,就比較容易從這一代流傳到下一代,留下來的資訊會越來越多。從這些資料,人們可以研究文學、提昇想法、擴展知識、也可以開始像醫學或是貿易等專業領域的研究。而隨著現代人們學習內容的增加及拓展,文學也會有一些不同,成為以後人們研究的基礎。\\n\\n許多古文明都有其對哲學或是相關觀點的文學,像是古中國、古印度、波斯時及希臘羅馬古典时代的作品。許多古代的作品,就算是敘事的形式,都還是有道德或是教誨上的目的,像梵語的《五卷书》或是奥维德的《变形记》,後來戏剧及讽刺作品的受眾也變多,因此也開始有類似性質的文學創作。抒情诗常常是貴族圈的特性產物,特別在東亞,許多歌曲被貴族收集,成為詩歌。\\n\\n浪漫主義的異常特質在中世紀綻放。同時,理性時代造就了民族主義史詩與哲學短文。浪漫主義強調通俗的文學及情感的投入,慢慢被尋求真實的現實主義與自然主義文學所取代。到了20世紀,象徵主義抬頭,探索角色的描述和發展。\\n\\n在很长一段时间,中国的文学与史学和神话并无明显的界限,最早的文学是对历史和神话的记录。但纯粹的文学早在周时就已出现,例如《诗經》。中國古代的文學主要著重在哲學、史学史、军事学、農業及韻文。中國發明了造紙術及雕版印刷,也是世界上第一個。中國的許多哲學思想是起源自春秋戰國時的诸子百家,其中最重要的有儒家、道家、墨家及法家,而軍事學書籍(如孙子兵法)也是在春秋戰國時開始出現。中国历史文學則從尚書、春秋、戰國策、史記等一直延續下來,而且有很詳細的資料記錄。\\n\\n中国的文学成就最大的是诗歌,从《离骚》到唐代律诗,诗歌一直对中国文坛有着巨大的影响。后来诗、词、曲、小说等文学形式分别在唐、宋、元、明清达到高峰。民國時期由胡適和陳獨秀推動的新文學運動,認為作品不應只講求形式,應注重內容的充實、表達及情感,也推動白話文學。民国时期,武侠小说风靡海内外,成为当时最受欢迎的通俗小说。\\n\\n中华人民共和国时期,在文化大革命後,出現相關的反思文學及伤痕文学,也有一批白话文诗人进行大量创作,也取代古诗成为当时最欢迎的诗歌作品。後來網路文學興起,成為受歡迎的商業作品。\\n\\n中華民國在撤退台灣後,在50及60年代出現了以四大抗戰小說為代表的戰鬥文藝小說,都是以抗戰時期為背景,後來又有反共文學的出現,而60年代開始,以瓊瑤為代表的言情小說也開始行。70年代起逐漸開始有對於台灣社會研究的新現代文學,以及強調鄉土的鄉土寫實文學,1990年後也開始了網路文學的興起。\\n\\n\\n中国古典文學分为诗和文,文又分为韵文和散文,中国的抒情诗和文言文最早而比较发达。\\n\\n文學一般分为小說、散文、詩歌、劇本,并称为四大文学体裁;\\n\\n\\n\\n劇本是另一种古老的文学形式,主要通过不同角色之间的对话来表达作者的思想和感情。劇本可以用于舞台的表演,也可以阅读。像元曲、京剧、昆剧都屬於這個部份。西方的戲劇許多都伴隨著音樂和舞蹈,例如歌劇及音樂劇,古希腊戏剧是目前已知最早期的西方戲劇,有悲劇、喜劇、悲喜劇等。\\n\\n\\n\\n有許多的文學獎,頒發給優秀的作家,表揚其文學的成就。因為文學的範圍很廣,許多文學獎項會依風格、文學類型、語言、國籍及其他特性(例如新進作家或是等)再做分類。\\n\\n諾貝爾文學獎是依諾貝爾在1895年的遺囑所成立的獎項,是諾貝爾獎中的一項,一般是因為作者的整體作品而獲獎,而非著重特定的作品。其他不分國籍的獎項有:纽斯塔特国际文学奖、布克國際獎及卡夫卡獎。\\n\\n\\n是文學創作者應用在文學中,製造特別效果的方式。文學技巧的範圍很廣,包括作品是否要用第一人稱或是其他人稱、用傳統的線性敘事或是、或是文類選擇都包括在內。這可以讓讀者感受到一些熟悉的結構及架構,例如傳統犯罪小說,不過有些作者會特別選擇一些文學技巧來讓讀者有意外的感受。\\n\\n文學技巧的使用也可能會產生新的文類,就像塞繆爾·理查森寫的早期現代小說《》一様。《Pamela》是用許多的信件組成,稱為「書信體技巧」(epistolary technique)。因此《Pamela》讓大家再次注意到,一個以往曾出現,但沒有這麼受注意的文類。\\n\\n文學技巧和文学手段(literary 
device)不同,有點類似軍事戰略和軍事戰術之間的關係。文学手段是在敘述中用的特殊結構,像是隐喻、明喻、省略、敘事及託寓等,甚至單純的諧音都可以作為文学手段。也可以視為是文学手段,例如意識流敘事。\\n\\n文學批評是指文學批評者對其他人作品的評論和評估,有時也會用來改進及提昇文學作品。也可以對作者帶來類似的作用。有許多不同種類的文學批評,背後會有其理論基礎,不同種類的文學批評可以評論文學作品的各個部份或是各個層面。\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n历史\\n\\n歷史(现代汉语词汇,古典文言文称之为史),指人类社会过去的事件和行动,以及对这些事件行为有系统的记录、诠释和研究。歷史可提供今人理解過去,作為未來行事的參考依據,与伦理、哲学和艺术同属人类精神文明的重要成果。历史的第二个含'" - ] - }, - "execution_count": 141, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Machine Learning Quick Review" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10k_articles.txt lesson01-Part01.pdf\r\n", - "80k_articles.txt lesson01-Part02.pdf\r\n", - "Lecture-01-An Introduction to AI.ipynb lesson01.pdf\r\n", - "Untitled.ipynb regression_example.py\r\n", - "Untitled1.ipynb sqlResult_1558435.csv\r\n" - ] - } - ], - "source": [ - "!ls" - ] - }, - { - "cell_type": "code", - "execution_count": 180, - "metadata": {}, - "outputs": [], - "source": [ - "titanic_content = pd.read_csv(open('../../datasource/titanic_train.csv'))" - ] - }, - { - "cell_type": "code", - "execution_count": 183, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NaNS
91012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NaNC
\n", - "
" - ], - "text/plain": [ - " PassengerId Survived Pclass \\\n", - "0 1 0 3 \n", - "1 2 1 1 \n", - "2 3 1 3 \n", - "3 4 1 1 \n", - "4 5 0 3 \n", - "5 6 0 3 \n", - "6 7 0 1 \n", - "7 8 0 3 \n", - "8 9 1 3 \n", - "9 10 1 2 \n", - "\n", - " Name Sex Age SibSp \\\n", - "0 Braund, Mr. Owen Harris male 22.0 1 \n", - "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", - "2 Heikkinen, Miss. Laina female 26.0 0 \n", - "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", - "4 Allen, Mr. William Henry male 35.0 0 \n", - "5 Moran, Mr. James male NaN 0 \n", - "6 McCarthy, Mr. Timothy J male 54.0 0 \n", - "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", - "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", - "9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", - "\n", - " Parch Ticket Fare Cabin Embarked \n", - "0 0 A/5 21171 7.2500 NaN S \n", - "1 0 PC 17599 71.2833 C85 C \n", - "2 0 STON/O2. 3101282 7.9250 NaN S \n", - "3 0 113803 53.1000 C123 S \n", - "4 0 373450 8.0500 NaN S \n", - "5 0 330877 8.4583 NaN Q \n", - "6 0 17463 51.8625 E46 S \n", - "7 1 349909 21.0750 NaN S \n", - "8 2 347742 11.1333 NaN S \n", - "9 0 237736 30.0708 NaN C " - ] - }, - "execution_count": 183, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "titanic_content[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 185, - "metadata": {}, - "outputs": [], - "source": [ - "titanic_content = pd.read_csv(open('../../datasource/titanic_train.csv'))\n", - "titanic_content = titanic_content.dropna()\n", - "age_with_fare = titanic_content[['Age', 'Fare']]\n", - "age_with_fare = age_with_fare[ (age_with_fare['Age'] > 22) & (age_with_fare['Fare'] < 400) & (age_with_fare['Fare'] > 130)]\n", - "age = age_with_fare['Age']\n", - "fare = age_with_fare['Fare']" - ] - }, - { - "cell_type": "code", - "execution_count": 199, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": 
"code", - "execution_count": 202, - "metadata": {}, - "outputs": [], - "source": [ - "L1 = np.array([1,2, 3])" - ] - }, - { - "cell_type": "code", - "execution_count": 210, - "metadata": {}, - "outputs": [], - "source": [ - "L2 = np.mean(np.array([2, 3, 4]))" - ] - }, - { - "cell_type": "code", - "execution_count": 211, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3.0" - ] - }, - "execution_count": 211, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "L2" - ] - }, - { - "cell_type": "code", - "execution_count": 208, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 1, 1])" - ] - }, - "execution_count": 208, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.abs((L2 - L1)) ** 2\n", - "\n", - "def loss(y_true, yhats): return np.mean(np.abs(y_true - yhats))" - ] - }, - { - "cell_type": "code", - "execution_count": 209, - "metadata": {}, - "outputs": [], - "source": [ - "## boradcast" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 198, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFCVJREFUeJzt3X+MHOd93/H3pxTtXuOkZ4enVPwVyoHMWv4RUT2rbIWmttyEkmuYhJEAChpbUI0QDdhUCmy6pg3ESAHBbhjYiRBEgBCptlDVrmrTtJAoZRRHreGiknoUJVESw5qtHYtHOqTh0nari0zR3/6xc9LqdOTu3q+9G75fwOFmn5lZfG9u97Ozzzy7T6oKSVJ7/Y1hFyBJWlwGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktZxBL0ktZ9BLUssZ9JLUcpcMuwCANWvW1KZNm4ZdhiStKAcPHvxOVY312m5ZBP2mTZuYmJgYdhmStKIk+ct+trPrRpJazqCXpJYz6CWp5Qx6SWo5g16SWm5ZjLqZi/2HJtl74CgnzkyxdnSE3ds2s2PLur7XS9IwLWVGrcig339okj37DjN19hwAk2em2LPvMAA7tqzruV6ShmmpM2pFdt3sPXD0xQM0bersOfYeONrXekkapqXOqBUZ9CfOTF2wvdd6SRqmpc6onkGfZEOSh5IcSfJ0klu61v16kqNN+293te9JcqxZt22hi147OnLB9l7rJWmYljqj+jmjfwH4YFW9EdgK7EpyZZJ3ANuBt1bVm4DfAUhyJXAj8CbgeuAPkqxayKJ3b9vMyOqX3+XI6lXs3ra5r/WSNExLnVE9L8ZW1UngZLP8gyRHgHXArwKfrKrnm3Wnml22A59v2r+R5BhwDfDfF6ro6YsV57ti3Wu9JA3TUmdUqqr/jZNNwFeBNze/v0znrP2vgQ9V1f9I8vvAw1X175t97gL+pKq+cL77HR8fL7/UTJIGk+RgVY332q7v4ZVJXgN8Ebi1qr6f5BLgtXS6c94G3Jfk9UBm2f0VryZJdgI7ATZu3NhvGZKkAfU16ibJajohf29V7WuajwP7quNR4EfAmqZ9Q9fu64ETM++zqu6sqvGqGh8b6/l1ypKkOepn1E2Au4AjVfWprlX7geuabd4AvAr4DnA/cGOSVye5HLgCeHShC5ck9aefrptrgfcBh5M83rR9FLgbuDvJU8APgZuq0+H/dJL7gGfojNjZVVXnZrlfSdIS6GfUzdeYvd8d4FfOs89twG3zqEuStEBW5CdjJUn9M+glqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWq5vr+PfqXZf2jSGaZ0UfM5oGmtDPr9hybZs+8wU2c7X5o5eWaKPfsOA/hA10XB54C6tbLrZu+Boy8+wKdNnT3H3gNHh1SRtLR8DqhbK4P+xJmpgdqltvE5oG6tDPq1oyMDtUtt43NA3VoZ9Lu3bWZk9aqXtY2sXsXubZuHVJG0tHwOqFsrL8ZOX2xyxIEuVj4H1C2daV6Ha3x8vCYmJoZdhiStKEkOVtV4r+1a2XUjSXqJQS9JLWfQS1LL9Qz6JBuSPJTkSJKnk9wyY/2HklSSNc3tJLk9ybEkTya5erGKlyT11s+omxeAD1bVY0l+HDiY5MGqeibJBuDngW91bX8DcEXz8/eBO5rfkqQh6HlGX1Unq+qxZvkHwBFgeozWp4EPA91Dd7YD91THw8BokssWtmxJUr8G6qNPsgnYAjyS5D3AZFU9MWOzdcCzXbeP89ILgyRpifX9gakkrwG+CNxKpzvnY8AvzLbpLG2vGKyfZCewE2Djxo39liFJGlBfZ/RJVtMJ+Xurah/wM8DlwBNJvgmsBx5L8nfonMFv6Np9PXBi5n1W1Z1VNV5V42NjY/P7KyRJ59XPqJsAdwFHqupTAFV1uKourapNVbWJTrhfXVXfBu4H3t+MvtkKfK+qTi7
enyBJupB+um6uBd4HHE7yeNP20ap64DzbPwC8CzgGPAfcPO8qJUlz1jPoq+przN7v3r3Npq7lAnbNuzJJ0oLwk7GS1HIGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktZxBL0ktZ9BLUssZ9JLUcga9JLWcQS9JLdf399FLC2n/oUn2HjjKiTNTrB0dYfe2zezY4vw00mIw6LXk9h+aZM++w0ydPQfA5Jkp9uw7DGDYS4vArhstub0Hjr4Y8tOmzp5j74GjQ6pIajeDXkvuxJmpgdolzY9BryW3dnRkoHZJ82PQa8nt3raZkdWrXtY2snoVu7dtHlJFUrt5MVZLbvqCq6NupKVh0GsodmxZZ7BLS8SuG0lqOYNeklrOoJeklusZ9Ek2JHkoyZEkTye5pWnfm+QvkjyZ5EtJRrv22ZPkWJKjSbYt5h8gSbqwfs7oXwA+WFVvBLYCu5JcCTwIvLmq3gr8T2APQLPuRuBNwPXAHyRZNes9S5IWXc+gr6qTVfVYs/wD4Aiwrqr+tKpeaDZ7GFjfLG8HPl9Vz1fVN4BjwDULX7okqR8D9dEn2QRsAR6ZseqfA3/SLK8Dnu1ad7xpm3lfO5NMJJk4ffr0IGVIkgbQd9AneQ3wReDWqvp+V/vH6HTv3DvdNMvu9YqGqjuraryqxsfGxgarWpLUt74+MJVkNZ2Qv7eq9nW13wS8G3hnVU2H+XFgQ9fu64ETC1OuJGlQ/Yy6CXAXcKSqPtXVfj3wr4H3VNVzXbvcD9yY5NVJLgeuAB5d2LIlSf3q54z+WuB9wOEkjzdtHwVuB14NPNh5LeDhqvoXVfV0kvuAZ+h06eyqqnOz3K8kaQn0DPqq+hqz97s/cIF9bgNum0ddkqQF4idjJanlDHpJajmDXpJazqCXpJZz4pEB7T806cxIklYUg34A+w9NsmffYabOdkaLTp6ZYs++wwB9hb0vEsub/x+1lV03A9h74OiLIT9t6uw59h442nPf6ReJyTNTFC+9SOw/NLlI1WoQ/n/UZgb9AE6cmRqovdt8XiS0+Pz/qM0M+gGsHR0ZqL3bfF4ktPj8/6jNDPoB7N62mZHVL59DZWT1KnZv29xz3/m8SGjx+f9Rmxn0A9ixZR2feO9bWDc6QoB1oyN84r1v6euC3XxeJLT4/P+ozRx1M6AdW9bNaSTG9D6O6lie/P+ozfLS18gPz/j4eE1MTAy7DElaUZIcrKrxXtvZdSNJLWfQS1LLGfSS1HIGvSS1nEEvSS1n0EtSyxn0ktRyBr0ktVzPoE+yIclDSY4keTrJLU3765I8mOTrze/XNu1JcnuSY0meTHL1Yv8RkqTz6+eM/gXgg1X1RmArsCvJlcBHgK9U1RXAV5rbADcAVzQ/O4E7FrxqSVLfegZ9VZ2sqsea5R8AR4B1wHbgs81mnwV2NMvbgXuq42FgNMllC165JKkvA/XRJ9kEbAEeAX6qqk5C58UAuLTZbB3wbNdux5s2SdIQ9B30SV4DfBG4taq+f6FNZ2l7xTenJdmZZCLJxOnTp/stQ5I0oL6CPslqOiF/b1Xta5r/arpLpvl9qmk/Dmzo2n09cGLmfVbVnVU1XlXjY2Njc61fktRDP6NuAtwFHKmqT3Wtuh+4qVm+CfhyV/v7m9E3W4HvTXfxSJKWXj8Tj1wLvA84nOTxpu2jwCeB+5J8APgW8EvNugeAdwHHgOeAmxe0YknSQHoGfVV9jdn73QHeOcv2BeyaZ12SpAXiJ2MlqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWo5g16SWs6gl6SWM+glqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWq5nkGf5O4kp5I81dV2VZKHkzyeZCLJNU17ktye5FiSJ5NcvZjFS5J66+eM/jPA9TPafhv4raq6CvjN5jbADcAVzc9O4I6FKVOSNFc9g76qvgp8d2Yz8BPN8t8GTjT
L24F7quNhYDTJZQtVrCRpcJfMcb9bgQNJfofOi8U/bNrXAc92bXe8aTs55wolSfMy14uxvwb8RlVtAH4DuKtpzyzb1mx3kGRn078/cfr06TmWIUnqZa5BfxOwr1n+T8A1zfJxYEPXdut5qVvnZarqzqoar6rxsbGxOZYhSeplrkF/AvjHzfJ1wNeb5fuB9zejb7YC36squ20kaYh69tEn+RzwdmBNkuPAx4FfBX4vySXAX9MZYQPwAPAu4BjwHHDzItQsSRpAz6Cvql8+z6q/N8u2Beyab1GSpIXjJ2MlqeUMeklqOYNeklrOoJekljPoJanlDHpJajmDXpJazqCXpJYz6CWp5Qx6SWo5g16SWs6gl6SWM+glqeUMeklqOYNeklpurpODa4ntPzTJ3gNHOXFmirWjI+zetpkdW9YNuyxJK4BBvwLsPzTJnn2HmTp7DoDJM1Ps2XcYwLCX1JNdNyvA3gNHXwz5aVNnz7H3wNEhVSRpJTHoV4ATZ6YGapekbgb9CrB2dGSgdknqZtCvALu3bWZk9aqXtY2sXsXubZuHVJGklcSLsSvA9AVXR91ImoueQZ/kbuDdwKmqenNX+68D/xJ4Afjjqvpw074H+ABwDvhXVXVgMQq/2OzYss5glzQn/ZzRfwb4feCe6YYk7wC2A2+tqueTXNq0XwncCLwJWAv8WZI3VNW5V9yrJGlJ9Oyjr6qvAt+d0fxrwCer6vlmm1NN+3bg81X1fFV9AzgGXLOA9UqSBjTXi7FvAP5RkkeS/Nckb2va1wHPdm13vGmTJA3JXC/GXgK8FtgKvA24L8nrgcyybc12B0l2AjsBNm7cOMcyJEm9zPWM/jiwrzoeBX4ErGnaN3Rttx44MdsdVNWdVTVeVeNjY2NzLEOS1Mtcg34/cB1AkjcArwK+A9wP3Jjk1UkuB64AHl2IQiVJc9PP8MrPAW8H1iQ5DnwcuBu4O8lTwA+Bm6qqgKeT3Ac8Q2fY5S5H3EjScKWTz8M1Pj5eExMTwy5DklaUJAerarzXdn4FgiS1nEEvSS3nd91oKOYzY5azbWkmHxMXZtBryc1nxixn29JMPiZ6s+tGS24+M2Y525Zm8jHR20V5Ru/bvOGaz4xZzralmXxM9HbRndFPv82bPDNF8dLbvP2HJodd2kVjPjNmOduWZvIx0dtFF/S+zRu++cyY5WxbmsnHRG8XXdeNb/OGbz4zZjnblmbyMdHbRffJ2Gs/+edMzhLq60ZH+G8fuW5JapCkheAnY8/Dt3mSLjYXXdeNb/MkXWwuuqAHJ9qW2sYh0xd2UQa9pPbwk7G9XXR99JLaxSHTvRn0klY0h0z3ZtBLWtH8ZGxvBr2kFc0h0715MVbSiuaQ6d4MekkrnkOmL8yuG0lqOYNeklquZ9AnuTvJqSRPzbLuQ0kqyZrmdpLcnuRYkieTXL0YRUuS+tfPGf1ngOtnNibZAPw88K2u5huAK5qfncAd8y9RkjQfPYO+qr4KfHeWVZ8GPgx0f8/xduCe6ngYGE1y2YJUKkmakzn10Sd5DzBZVU/MWLUOeLbr9vGmbbb72JlkIsnE6dOn51KGJKkPAwd9kr8FfAz4zdlWz9I268wmVXVnVY1X1fjY2NigZUiS+jSXcfQ/A1wOPJEEYD3wWJJr6JzBb+jadj1wYr5FSpLmbuAz+qo6XFWXVtWmqtpEJ9yvrqpvA/cD729G32wFvldVJxe2ZEnSIHqe0Sf5HPB2YE2S48DHq+qu82z+APAu4BjwHHDzAtWpIbnQhA5O9iCtDD2Dvqp+ucf6TV3LBeyaf1laDi40oQPgZA/SCuF33ei8ek3ocL51ix30vsuQBmPQ67zmMqHDYk/24LsMaXAGvc5r7egIk7ME9/SEDhdat1iW67sMaTnzS810Xhea0GFYkz1c6F2GU8pJs/OMXufVz4QOS90fvhzfZUjLXToDZYZrfHy8JiYmhl2GVoCZffTQeSfxife+BeC86+y6URslOVhV472284x
eK8pyfJchLXee0UvSCtXvGb0XYyWp5Qx6SWo5g16SWs6gl6SWM+glqeWWxaibJKeBvxx2HcvEGuA7wy5ihfBY9cfj1J+VeJx+uqp6TtG3LIJeL0ky0c9wKXms+uVx6k+bj5NdN5LUcga9JLWcQb/83DnsAlYQj1V/PE79ae1xso9eklrOM3pJajmDfoiS/M0kjyZ5IsnTSX6rab88ySNJvp7kPyZ51bBrXQ6SrEpyKMkfNbc9TjMk+WaSw0keTzLRtL0uyYPNcXowyWuHXeewJRlN8oUkf5HkSJJ/0ObjZNAP1/PAdVX1s8BVwPVJtgL/Fvh0VV0B/B/gA0OscTm5BTjSddvjNLt3VNVVXUMFPwJ8pTlOX2luX+x+D/jPVfV3gZ+l87hq7XEy6IeoOv5vc3N181PAdcAXmvbPAjuGUN6ykmQ98E+BP2xuB49Tv7bTOT7gcSLJTwA/B9wFUFU/rKoztPg4GfRD1nRHPA6cAh4E/hdwpqpeaDY5DjhzBvwu8GHgR83tn8TjNJsC/jTJwSQ7m7afqqqTAM3vS4dW3fLweuA08O+arsA/TPJjtPg4GfRDVlXnquoqYD1wDfDG2TZb2qqWlyTvBk5V1cHu5lk2vaiPU+PaqroauAHYleTnhl3QMnQJcDVwR1VtAf4fLeqmmY1Bv0w0bx3/C7AVGE0yPc3jeuDEsOpaJq4F3pPkm8Dn6XTZ/C4ep1eoqhPN71PAl+icPPxVkssAmt+nhlfhsnAcOF5VjzS3v0An+Ft7nAz6IUoylmS0WR4B/gmdi0IPAb/YbHYT8OXhVLg8VNWeqlpfVZuAG4E/r6p/hsfpZZL8WJIfn14GfgF4CrifzvEBjxNV9W3g2SSbm6Z3As/Q4uPkB6aGKMlb6Vz0WUXnRfe+qvo3SV5P58z1dcAh4Feq6vnhVbp8JHk78KGqerfH6eWa4/Gl5uYlwH+oqtuS/CRwH7AR+BbwS1X13SGVuSwkuYrOhf1XAf8buJnmOUgLj5NBL0ktZ9eNJLWcQS9JLWfQS1LLGfSS1HIGvSS1nEEvSS1n0EtSyxn0ktRy/x8ZuDYH17yb9wAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.scatter(age, fare)" - ] } ], "metadata": { diff --git a/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb index 0155d67..9a7478c 100644 --- a/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb +++ b/2018-autumn/Lecture-2-Wikipedia-Smoothing-ustccheng02.ipynb @@ -42,12 +42,32 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "MemoryError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mall_wiki_content\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'D://pyproject//git//AI-NLP//data//text//AA//wiki_00'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'UTF-8'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mall_wiki_content\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msub\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'<[^>]+>'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mall_wiki_content\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 去掉 tag\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mD:\\Anaconda3\\lib\\codecs.py\u001b[0m in \u001b[0;36mdecode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[1;31m# decode input (taking the buffer into account)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
320\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m \u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsumed\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_buffer_decode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfinal\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 322\u001b[0m \u001b[1;31m# keep undecoded input until the next call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuffer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mconsumed\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mMemoryError\u001b[0m: " + ] + } + ], "source": [ "all_wiki_content = open('D://pyproject//git//AI-NLP//data//text//AA//wiki_00',encoding='UTF-8').read()\n", "all_wiki_content = re.sub(r'<[^>]+>','',all_wiki_content) # 去掉 tag" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 在8g内存的电脑上能跑完,但在内存小的电脑上报MemoryError" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -386,46 +406,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### 2-gram 和 smoothing 并没能很好的区分这些语句。感觉问题主要是上下文的距离超过了2-gram的长度。分词后,再建立3-gram模型,或许能够区分开来。" + ] } ], "metadata": {