In this tutorial, we will show you how to extract hyperlinks from an HTML page. For example, to get the link from the following content:
this is text1 <a href='mkyong.com' target='_blank'>hello</a> this is text2...
- First get the “value” from
a
tag – Result: a href='mkyong.com' target='_blank'
- Later get the “link” from above extracted value – Result :
mkyong.com
1. Regular Expression Pattern
Extract A tag Regular Expression Pattern
(?i)<a([^>]+)>(.+?)</a>
Extract Link From A tag Regular Expression Pattern
\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))
Description
(?i) # inline flag, not a capturing group: all matching is case-insensitive <a # start with "<a" ( # start of group #1 [^>]+ # anything except ">", at least one character ) # end of group #1 > # followed by ">" (.+?) # group #2: match anything (reluctantly), at least one character </a> # end with "</a>"
\s* # can start with whitespace (?i) # all matching is case-insensitive href # followed by the word "href" \s*=\s* # allows spaces on either side of the equals sign ( # start of group #1 "([^"]*") # allow a string enclosed in double quotes - "string" | # ..or '[^']*' # allow a string enclosed in single quotes - 'string' | # ..or ([^'">\s]+) # an unquoted value: cannot contain a single quote, a double quote, ">" or whitespace ) # end of group #1
2. Java Link Extractor Example
Here’s a simple Java Link extractor example, to extract the
a
tag value with the 1st pattern, and then use the 2nd pattern to extract the link from that value.
HTMLLinkExtractor.java
package com.mkyong.crawler.core; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; public class HTMLLinkExtractor { private Pattern patternTag, patternLink; private Matcher matcherTag, matcherLink; private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>"; private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))"; public HTMLLinkExtractor() { patternTag = Pattern.compile(HTML_A_TAG_PATTERN); patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN); } /** * Validate html with regular expression * * @param html * html content for validation * @return Vector links and link text */ public Vector<HtmlLink> grabHTMLLinks(final String html) { Vector<HtmlLink> result = new Vector<HtmlLink>(); matcherTag = patternTag.matcher(html); while (matcherTag.find()) { String href = matcherTag.group(1); // href String linkText = matcherTag.group(2); // link text matcherLink = patternLink.matcher(href); while (matcherLink.find()) { String link = matcherLink.group(1); // link HtmlLink obj = new HtmlLink(); obj.setLink(link); obj.setLinkText(linkText); result.add(obj); } } return result; } class HtmlLink { String link; String linkText; HtmlLink(){}; @Override public String toString() { return new StringBuffer("Link : ").append(this.link) .append(" Link Text : ").append(this.linkText).toString(); } public String getLink() { return link; } public void setLink(String link) { this.link = replaceInvalidChar(link); } public String getLinkText() { return linkText; } public void setLinkText(String linkText) { this.linkText = linkText; } private String replaceInvalidChar(String link){ link = link.replaceAll("'", ""); link = link.replaceAll("\"", ""); return link; } } }
3. Unit Test
Unit test with TestNG. Simulate the HTML content via
@DataProvider
.
TestHTMLLinkExtractor.java
package com.mkyong.crawler.core; import java.util.Vector; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import com.mkyong.crawler.core.HTMLLinkExtractor.HtmlLink; /** * HTML link extrator Testing * * @author mkyong * */ public class TestHTMLLinkExtractor { private HTMLLinkExtractor htmlLinkExtractor; String TEST_LINK = "http://www.google.com"; @BeforeClass public void initData() { htmlLinkExtractor = new HTMLLinkExtractor(); } @DataProvider public Object[][] HTMLContentProvider() { return new Object[][] { new Object[] { "abc hahaha <a href='" + TEST_LINK + "'>google</a>" }, new Object[] { "abc hahaha <a HREF='" + TEST_LINK + "'>google</a>" }, new Object[] { "abc hahaha <A HREF='" + TEST_LINK + "'>google</A> , " + "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" }, new Object[] { "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" }, new Object[] { "abc hahaha <A target='_blank' HREF='" + TEST_LINK + "'>google</A>" }, new Object[] { "abc hahaha <A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>" }, new Object[] { "abc hahaha <a HREF=" + TEST_LINK + ">google</a>" }, }; } @Test(dataProvider = "HTMLContentProvider") public void ValidHTMLLinkTest(String html) { Vector<HtmlLink> links = htmlLinkExtractor.grabHTMLLinks(html); //there must have something Assert.assertTrue(links.size() != 0); for (int i = 0; i < links.size(); i++) { HtmlLink htmlLinks = links.get(i); //System.out.println(htmlLinks); Assert.assertEquals(htmlLinks.getLink(), TEST_LINK); } } }
Result
[TestNG] Running: /private/var/folders/w8/jxyz5pf51lz7nmqm_hv5z5br0000gn/T/testng-eclipse--530204890/testng-customsuite.xml PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>") PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>") PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>") PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>") PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>") PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF="http://www.google.com">google</A>") PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")