-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathhtml-parser-utils.js
More file actions
107 lines (93 loc) · 3.62 KB
/
html-parser-utils.js
File metadata and controls
107 lines (93 loc) · 3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
var cheerio = require("cheerio");
var unique = require("array-unique");
const LINKS_LIMIT = 20;
module.exports = function(htmlStr, domain) {
var self = this;
replaceAbsoluteLinkReferences(); // remove absolute references right away
/**
 * Finds the links in the html string this parser was constructed with.
 * Anchors nested inside <nav> elements are tried first; when they yield
 * nothing, every anchor in the document is considered instead.
 * @return {array} list of links that were found
 */
this.extractLinks = function () {
  // Parse once with cheerio and reuse the handle for both selector attempts.
  var dom = cheerio.load(htmlStr);
  var links = prepareExtractedLinks(dom, "nav a");
  if (links.length > 0) {
    return links;
  }
  return prepareExtractedLinks(dom, "a");
};
/**
 * Helper that compiles the same-site links matched by a selector. It is
 * reused to first look at links inside navs and then, if that is empty,
 * to fall back to any other link on the page.
 *
 * At most LINKS_LIMIT matched anchors are inspected and at most 10 unique
 * links are returned. Each href is handled as follows:
 *  - root-relative ("/path") and relative ("path") hrefs are resolved against `domain`;
 *  - protocol-relative ("//host/path") hrefs inherit the scheme of `domain`;
 *  - absolute http(s) hrefs are kept only if they share the root domain of `domain`;
 *  - hrefs with any other scheme (mailto:, tel:, javascript:, ...) are skipped.
 *
 * @param {function} $ cheerio handle of the loaded document
 * @param {string} jqSelector selector matching the anchors to inspect
 * @return {array} unique absolute links, in document order
 */
function prepareExtractedLinks($, jqSelector) {
  const MAX_RETURNED_LINKS = 10;
  const linkQueue = [];
  const domainNoSlash = domain.endsWith("/") ? domain.slice(0, -1) : domain;
  const siteRoot = self.getRootDomain(domain);
  // Scheme of the site itself, reused for protocol-relative hrefs; assumes http when `domain` has none.
  const schemeMatch = /^([a-z][a-z0-9+.-]*:)\/\//i.exec(domain);
  const siteScheme = schemeMatch ? schemeMatch[1] : "http:";

  $(jqSelector).each(function (i, link) {
    if (i >= LINKS_LIMIT) return false; // returning false stops the iteration
    let linkUrl = $(link).attr("href");
    if (linkUrl == null) return; // anchor without href
    linkUrl = linkUrl.trim();

    if (linkUrl.startsWith("//")) {
      // Protocol-relative: names its own host, so it must pass the root-domain check below.
      linkUrl = siteScheme + linkUrl;
    } else if (linkUrl.startsWith("/")) {
      // Root-relative: same site by construction.
      linkQueue.push(domainNoSlash + linkUrl);
      return;
    } else if (/^https?:/i.test(linkUrl)) {
      // Absolute http(s) link: checked against the site's root domain below.
    } else if (/^[a-z][a-z0-9+.-]*:/i.test(linkUrl)) {
      // Any other scheme (mailto:, tel:, javascript:, ...) is not a page link.
      return;
    } else {
      // Plain relative path.
      linkUrl = domainNoSlash + "/" + linkUrl;
    }

    if (self.getRootDomain(linkUrl) === siteRoot) linkQueue.push(linkUrl);
  });

  return unique(linkQueue).slice(0, MAX_RETURNED_LINKS);
}
/**
 * Strips every literal occurrence of `domain` from `htmlStr`, turning absolute
 * same-site links into relative ones. The replacement applies to the whole
 * html string, not only to href attributes.
 */
function replaceAbsoluteLinkReferences() {
  // split/join removes the text literally; building a RegExp from `domain`
  // would treat ".", "?", "(" etc. as pattern syntax (wrong matches or a throw).
  htmlStr = htmlStr.split(domain).join("");
}
/**
 * Scans the html string for e-mail-like tokens (name@host.tld).
 * @return {array} the distinct matches; empty when nothing matched
 */
this.extractEmails = function () {
  var emailPattern = /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi;
  var found = htmlStr.match(emailPattern);
  // String#match yields null (not an empty array) when there is no match.
  if (!found) {
    found = [];
  }
  return unique(found);
};
/**
 * Reduces a url (or bare host) to its root domain by dropping subdomains,
 * e.g. "http://www.example.com/page" -> "example.com".
 *
 * A two-letter second-level label under a two-letter TLD (".co.uk", ".me.uk")
 * is treated as part of the suffix, so three labels are kept in that case.
 * This is a length heuristic only: suffixes such as ".com.au" are not
 * recognised, and a two-letter site name under a ccTLD keeps one subdomain.
 *
 * @see https://stackoverflow.com/questions/8498592/extract-hostname-name-from-string
 * @param {string} linkUrl url or host name
 * @return {string} root domain of the url's host
 */
this.getRootDomain = function (linkUrl) {
  const hostDomain = extractHostname(linkUrl);

  // An IPv4 address has no subdomains to strip.
  if (/^\d{1,3}(\.\d{1,3}){3}$/.test(hostDomain)) return hostDomain;

  const labels = hostDomain.split(".");
  const count = labels.length;
  if (count <= 2) return hostDomain; // already a root domain (or a single label)

  const tld = labels[count - 1];
  const secondLevel = labels[count - 2];
  // Both labels must be two letters; testing only the TLD would keep the
  // subdomain of every ccTLD host ("www.example.de" would stay as is).
  const keep = tld.length === 2 && secondLevel.length === 2 ? 3 : 2;
  return labels.slice(-keep).join(".");
};
/**
 * Extracts the host name from a url, dropping scheme, userinfo, port, path,
 * query string and fragment. Input without a leading "scheme://" or "//" is
 * read as starting with the host itself (e.g. "example.com/page").
 *
 * @see https://stackoverflow.com/questions/8498592/extract-hostname-name-from-string
 * @param {string} linkUrl url or host name
 * @return {string} host name; empty string when the input starts with a path
 */
function extractHostname(linkUrl) {
  // Only a scheme at the very start counts; "://" may also occur later inside
  // a query string (e.g. "?redirect=http://...") and must not be mistaken for it.
  const authorityPrefix = /^(?:[a-z][a-z0-9+.-]*:)?\/\//i.exec(linkUrl);
  const hasAuthority = authorityPrefix !== null;
  const rest = hasAuthority ? linkUrl.slice(authorityPrefix[0].length) : linkUrl;

  // The host ends at the first path, query or fragment delimiter.
  let hostname = rest.split(/[\/?#]/)[0];

  // Drop "user:password@" when the url explicitly carries an authority part.
  if (hasAuthority) hostname = hostname.slice(hostname.lastIndexOf("@") + 1);

  // Drop the port number.
  return hostname.split(":")[0];
}
}