Archive for November, 2012

Why find an HTML5 parser when you have a web browser?

So, I wanted to scrape some content from a website that I have an offline archive of. It turns out that’s pretty easy to do using jQuery: do an Ajax GET, and then run jQuery selectors against the returned HTML. Neat.

<!doctype html>
<html lang="en">
	<head>
		<meta charset="utf-8">
		<title>Session scraper</title>
		<script src="https://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.8.2.min.js"></script>
	</head>
	<body>
		<label for="mytextarea">Scraped sessions (JSON)</label>
		<textarea id="mytextarea" rows="20" cols="50"></textarea>
		<script>
		// Scrapes session details out of archived session pages and shows the
		// collected records as JSON in the textarea above.
		$(function() {
			var $textarea = $('#mytextarea');
			// .val() is the documented way to set a textarea's value; .text()
			// only changes its default text and is ignored after a user edit.
			$textarea.val('hello world');

			// One record is appended here for every page scraped successfully.
			var sessions = [];

			// Every element of interest shares this generated id prefix.
			var idPrefix = '#ctl00_ContentPlaceHolder1_ListView1_ctrl0_';

			/**
			 * Fetch one session page, extract its details, append them to
			 * `sessions` and refresh the JSON shown in the textarea.
			 *
			 * @param {string} url       Address of the session page to fetch.
			 * @param {string} sessionId Accepted from the caller but not
			 *                           currently used by the scrape.
			 */
			var doScrape = function (url, sessionId) {
				$.get(url, function (data) {
					// Parse the response once, and wrap it in a detached <div>
					// so that .find() can also reach top-level nodes.
					var $page = $('<div>').append($.parseHTML(data));
					var find = function (suffix) {
						return $page.find(idPrefix + suffix);
					};

					var title = find('TitleLabel').html();

					var $speakerLink = find('speakersDataList_ctl00_HyperLink1');
					var speakerName = $speakerLink.text();
					// NOTE(review): the original read an undefined variable here.
					// The example below suggests the value is the speaker
					// link's href - confirm against a real page.
					var speakerIdentifierLink = $speakerLink.attr('href') || '';
					// turn "socalcodecamp/presenters.aspx#JeremyClark" into 'JeremyClark'
					var speakerIdentifier = (/#[a-zA-Z]+$/.exec(speakerIdentifierLink) || [""])[0].replace("#","");

					var sessionDateTime = find('ListView2_ctrl0_StartLabel').text();

					var tags = find('DataList1 a').map(function () {
						return $(this).text();
					}).get();

					var room = find('ListView3_ctrl0_RoomTitleLabel').text();

					// The description is every sibling that follows the <p>
					// wrapping the description label, kept as raw HTML.
					var description = '';
					find('DescriptionLabel').parent('p').nextAll().each(function () {
						description += this.outerHTML;
					});

					sessions.push({
						title: title,
						speakerName: speakerName,
						speakerIdentifier: speakerIdentifier,
						sessionDateTime: sessionDateTime,
						tags: tags,
						room: room,
						description: description
					});
					$textarea.val(JSON.stringify(sessions));
				}, 'html').fail(function (jqXHR, textStatus, errorThrown) {
					// Report the failure instead of silently leaving stale output.
					$textarea.val('Failed to load ' + url + ': ' + textStatus + ' ' + errorThrown);
				});
			};

			// NOTE(review): %3F is an encoded '?', presumably so the archived
			// file name is requested literally instead of being treated as a
			// query string - confirm against the archive layout.
			doScrape("session.aspx%3Fsid=fabd2dad-e502-44cd-9590-55ffc182cea4","fabd2dad-e502-44cd-9590-55ffc182cea4");
		});
		</script>
	</body>
</html>
Advertisements

Leave a comment