a blog for those who code

Thursday 21 January 2016

How to parse HTML data in Node.js

In this post we will discuss parsing HTML data in Node.js. We will use htmlparser2 to parse the HTML data. Htmlparser2 is a forgiving HTML/XML/RSS parser. The parser can handle streams and provides a callback interface. There are many alternative libraries that can accomplish the same thing.

We will write code that parses the HTML of the CodingDefined.com website and counts how many times each tag is used. In the code below, we first use the HTTP module to fetch the HTML data and then use htmlparser2 to find out which tag names exist and how many times each one occurs. Finally, we display the tags with their counts in the console.

Code : 

var http = require('http');
var htmlparser = require('htmlparser2');

var url = 'http://www.codingdefined.com';

// Fetch the page and hand the response stream to parseResponse.
// parseResponse is assigned further down in the file; that is safe because
// this callback only runs once the response arrives, after the rest of the
// script has been evaluated.
http.get(url, function(response) {
  parseResponse(response);
}).on('error', function(err) {
  // Without a listener, a failed request (DNS failure, refused connection, ...)
  // emits an unhandled 'error' event and crashes the process.
  console.error('Request to ' + url + ' failed: ' + err.message);
});

/**
 * Collects the body of an HTTP response, parses it as HTML with htmlparser2
 * and logs an array of {name, count} objects, one per distinct tag name, in
 * order of first appearance.
 *
 * @param {Object} response - Readable response stream, as passed to the
 *   http.get callback.
 */
var parseResponse = function(response) {
  var data = '';
  // Decode at the stream level so a multi-byte character that straddles two
  // chunks is not corrupted by converting each chunk to a string separately.
  response.setEncoding('utf8');
  response.on('data', function(chunk) {
    data += chunk;
  });

  // Tag names in order of first appearance.
  var tags = [];
  // Tag name -> number of occurrences. Created without a prototype so the
  // `in` check below only ever matches tag names that were actually seen.
  var tagsCount = Object.create(null);
  var tagsWithCount = [];

  response.on('end', function() {
    var parser = new htmlparser.Parser({
      onopentag: function(name) {
        if (name in tagsCount) {
          tagsCount[name]++;
        } else {
          tags.push(name);
          tagsCount[name] = 1;
        }
      },
      onend: function() {
        // Index from 0 so the first tag encountered is included in the output.
        for (var i = 0; i < tags.length; i++) {
          tagsWithCount.push({name: tags[i], count: tagsCount[tags[i]]});
        }
      }
    }, {decodeEntities: true});
    parser.write(data);
    parser.end();
    console.log(tagsWithCount);
  });
};

The above code prints to the console an array of objects, one per tag name, each holding the tag's name and its count:


Please Like and Share the CodingDefined.com Blog, if you find it interesting and helpful.

No comments:

Post a Comment