<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Standard Deviations &#187; Hadoop</title>
	<atom:link href="http://parand.com/say/index.php/category/hadoop/feed/" rel="self" type="application/rss+xml" />
	<link>http://parand.com/say</link>
	<description>Parand Tony Darugar: A Cruel and Petty Dictator</description>
	<lastBuildDate>Wed, 11 Jan 2012 20:33:20 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.8.4</generator>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>San Diego Hadoop User Group Starting</title>
		<link>http://parand.com/say/index.php/2009/01/04/san-diego-hadoop-user-group-starting/</link>
		<comments>http://parand.com/say/index.php/2009/01/04/san-diego-hadoop-user-group-starting/#comments</comments>
		<pubDate>Mon, 05 Jan 2009 07:35:31 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=736</guid>
		<description><![CDATA[We&#8217;re starting a San Diego Hadoop User Group. Please send me an email ( darugar at gmail ) if you&#8217;re interested or leave a comment here. Details to be worked out, but we&#8217;ll likely have the first meeting towards the end of February.
]]></description>
			<content:encoded><![CDATA[<p>We&#8217;re starting a San Diego Hadoop User Group. Please send me an email ( darugar at gmail ) if you&#8217;re interested or leave a comment here. Details to be worked out, but we&#8217;ll likely have the first meeting towards the end of February.</p>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2009/01/04/san-diego-hadoop-user-group-starting/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Cloud Computing Hadoop Slides</title>
		<link>http://parand.com/say/index.php/2008/11/20/cloud-computing-hadoop-slides/</link>
		<comments>http://parand.com/say/index.php/2008/11/20/cloud-computing-hadoop-slides/#comments</comments>
		<pubDate>Thu, 20 Nov 2008 23:34:11 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=722</guid>
		<description><![CDATA[Slides from my Data Processing in the Cloud talk:
Cloud Computing: Hadoop
View SlideShare presentation or Upload your own. (tags: pig cloud)

]]></description>
			<content:encoded><![CDATA[<p>Slides from my <a href="http://dataservicesworld.sys-con.com/general/session1108.htm?id=95">Data Processing in the Cloud talk</a>:</p>
<div id="__ss_773531" style="width: 425px; text-align: left;"><a style="font:14px Helvetica,Arial,Sans-serif;display:block;margin:12px 0 3px 0;text-decoration:underline;" title="Cloud Computing: Hadoop" href="http://www.slideshare.net/darugar/cloud-computing-hadoop-presentation?type=powerpoint">Cloud Computing: Hadoop</a><object classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" width="425" height="355" codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="src" value="http://static.slideshare.net/swf/ssplayer2.swf?doc=cloud-computing-hadoop-1227222647370363-8&amp;stripped_title=cloud-computing-hadoop-presentation" /><embed type="application/x-shockwave-flash" width="425" height="355" src="http://static.slideshare.net/swf/ssplayer2.swf?doc=cloud-computing-hadoop-1227222647370363-8&amp;stripped_title=cloud-computing-hadoop-presentation" allowscriptaccess="always" allowfullscreen="true"></embed></object>
<div style="font-size:11px;font-family:tahoma,arial;height:26px;padding-top:2px;">View SlideShare <a style="text-decoration:underline;" title="View Cloud Computing: Hadoop on SlideShare" href="http://www.slideshare.net/darugar/cloud-computing-hadoop-presentation?type=powerpoint">presentation</a> or <a style="text-decoration:underline;" href="http://www.slideshare.net/upload?type=powerpoint">Upload</a> your own. (tags: <a style="text-decoration:underline;" href="http://slideshare.net/tag/pig">pig</a> <a style="text-decoration:underline;" href="http://slideshare.net/tag/cloud">cloud</a>)</div>
</div>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/11/20/cloud-computing-hadoop-slides/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Python Scripts For Dumping Oracle Data And Loading Onto Hadoop DFS</title>
		<link>http://parand.com/say/index.php/2008/10/22/python-scripts-for-dumping-oracle-data-and-loading-onto-hadoop-dfs/</link>
		<comments>http://parand.com/say/index.php/2008/10/22/python-scripts-for-dumping-oracle-data-and-loading-onto-hadoop-dfs/#comments</comments>
		<pubDate>Wed, 22 Oct 2008 18:00:51 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Python]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=703</guid>
		<description><![CDATA[There have been several requests for this, so I might as well post it here for general use. I put together a simple system for dumping data out of Oracle databases and loading onto Hadoop DFS. The slightly interesting part is the parallelism &#8211; Python&#8217;s Processing library is used to dump partitions in parallel and [...]]]></description>
			<content:encoded><![CDATA[<p>There have been several requests for this, so I might as well post it here for general use. I put together a simple system for dumping data out of Oracle databases and loading onto Hadoop DFS. The slightly interesting part is the parallelism &#8211; <a href="http://pyprocessing.berlios.de/" target="_blank">Python&#8217;s Processing library</a> is used to dump partitions in parallel and copy and load them onto DFS in parallel. This helps when dumping large amounts of data from partitioned Oracle tables.</p>
<p>The database interaction is handled by <a href="/say/misc/db.py">db.py</a> . There are a couple of helper functions for finding table partitions, etc. DBDumper dumps the requested fields from the requested table:</p>
<pre><code lang="python">
dumper = db.DBDumper('username/password@yourhost:9999/DB', 'table_name',
      ('field1', 'field2', 'field3'), 'owner', 'partition', 'output_dir', 10)
dumper.dump(cp)

</code></pre>
<p>Where 10 is the level of concurrency, <em>owner</em> is the owner of the table, and <em>partition</em> is the name of the partitions you&#8217;re interested in (can be None).</p>
<p><a href="/say/misc/dfs.py">dfs.py</a> copies the dumped files over in parallel, again using PyProcessing. It&#8217;s simply a wrapper around &#8220;cat | ssh | hadoop dfs -put&#8221;. </p>
<p>DBDumper and dfs are tied together via a callback &#8211; when each partition is dumped, the callback is invoked, triggering the dfs copy. </p>
<p>Here&#8217;s a complete example of using these to dump and copy data:</p>
<pre><code lang="python">
import db
import dfs

fs = dfs.RemoteDFS('address.of.remote.machine')

def cp(arg):
    print "CALLBACK:", arg
    fs.cp(arg[0], '/some/directory/' + arg[1] + '/' + arg[2])

dumper = db.DBDumper('username/password@yourhost:9999/DB', 'table_name',
     ('field1', 'field2', 'field3'), 'owner', 'partition', 'output_dir', 10)
dumper.dump(cp)
</code></pre>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/10/22/python-scripts-for-dumping-oracle-data-and-loading-onto-hadoop-dfs/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
		</item>
		<item>
		<title>Speaking at DataServices World on Hadoop</title>
		<link>http://parand.com/say/index.php/2008/10/08/speaking-at-dataservices-world-on-hadoop/</link>
		<comments>http://parand.com/say/index.php/2008/10/08/speaking-at-dataservices-world-on-hadoop/#comments</comments>
		<pubDate>Wed, 08 Oct 2008 17:52:21 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=685</guid>
		<description><![CDATA[I&#8217;m giving a talk on Data Processing In The Cloud on November 20th 2008 at DataServices World in the Fairmont in San Jose. I hope to see you there. Here&#8217;s the abstract:
Hadoop, an open source implementation of map/reduce, has garnered tremendous momentum in large scale data processing, marting, and on occasion warehousing. This session will [...]]]></description>
			<content:encoded><![CDATA[<p>I&#8217;m giving a talk on <a href="http://dataservicesworld.sys-con.com/general/session1108.htm?id=95" target="_blank">Data Processing In The Cloud</a> on November 20th 2008 at DataServices World in the Fairmont in San Jose. I hope to see you there. Here&#8217;s the abstract:</p>
<p>Hadoop, an open source implementation of map/reduce, has garnered tremendous momentum in large scale data processing, marting, and on occasion warehousing. This session will examine:</p>
<ul>
<li>The current state and industry adoption of Hadoop and cloud-based data processing</li>
<li>The programming model, capabilities, common patterns, and best-practices for Hadoop deployment and usage</li>
<li>The ecology of value-add technologies and services in the grid computing and data processing world</li>
<li>Models for using grid-based data processing alongside traditional technologies and techniques.</li>
</ul>
<p>I&#8217;ll also be participating in UCSD Hackweek coming up in about 2 weeks.</p>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/10/08/speaking-at-dataservices-world-on-hadoop/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
		</item>
		<item>
		<title>Happy: Hadoop with Python (Jython)</title>
		<link>http://parand.com/say/index.php/2008/09/24/happy-hadoop-with-python-jython/</link>
		<comments>http://parand.com/say/index.php/2008/09/24/happy-hadoop-with-python-jython/#comments</comments>
		<pubDate>Wed, 24 Sep 2008 17:58:00 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Python]]></category>
		<category><![CDATA[Scaling]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=679</guid>
		<description><![CDATA[The Freebase folks have open sourced their Python (Jython) based Hadoop framework, calling it Happy. Looks interesting, will need to give it a whirl when I get a chance.
]]></description>
			<content:encoded><![CDATA[<p>The <a href="http://www.freebase.com/" target="_blank">Freebase</a> folks have open sourced their Python (Jython) based Hadoop framework, calling it <a href="http://code.google.com/p/happy/" target="_blank">Happy</a>. Looks interesting, will need to give it a whirl when I get a chance.</p>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/09/24/happy-hadoop-with-python-jython/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Pig (Hadoop) Commands And Sample Results</title>
		<link>http://parand.com/say/index.php/2008/06/19/pig-hadoop-commands-and-sample-results/</link>
		<comments>http://parand.com/say/index.php/2008/06/19/pig-hadoop-commands-and-sample-results/#comments</comments>
		<pubDate>Thu, 19 Jun 2008 18:22:20 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>
		<category><![CDATA[Uncategorized]]></category>
		<category><![CDATA[Pig]]></category>

		<guid isPermaLink="false">http://parand.com/say/?p=592</guid>
		<description><![CDATA[I find seeing the results of Pig commands on sample data a good companion to the PigLatin language reference, so I set up some simple sample data and ran commands, capturing the results. Here&#8217;s the sample data as well as the commands:
/data/one:

a	A	1
b	B	2
c	C	3
a	AA	11
a	AAA	111
b	BB	22


/data/two:

x	X	a
y	Y	b
x	XX	b
z	Z	c


Pig commands and their results:

one = load 'data/one' using PigStorage();
two = load 'data/two' using PigStorage();

generated = [...]]]></description>
			<content:encoded><![CDATA[<p>I find seeing the results of <a target="_blank" href="http://incubator.apache.org/pig/">Pig</a> commands on sample data a good companion to the <a target="_blank" href="http://wiki.apache.org/pig/PigLatin">PigLatin language reference</a>, so I set up some simple sample data and ran commands, capturing the results. Here&#8217;s the sample data as well as the commands:</p>
<p><strong>/data/one:</strong></p>
<pre><code>
a	A	1
b	B	2
c	C	3
a	AA	11
a	AAA	111
b	BB	22
</code></pre>
<p></p>
<p><strong>/data/two:</strong></p>
<pre><code>
x	X	a
y	Y	b
x	XX	b
z	Z	c
</code></pre>
<p></p>
<p><strong>Pig commands and their results:</strong></p>
<pre><code>
one = load 'data/one' using PigStorage();
two = load 'data/two' using PigStorage();

generated = FOREACH one GENERATE $0, $2;
(a, 1)
(b, 2)
(c, 3)
(a, 11)
(a, 111)
(b, 22)

grouped = GROUP one BY $0;
(a, {(a, A, 1), (a, AA, 11), (a, AAA, 111)})
(b, {(b, B, 2), (b, BB, 22)})
(c, {(c, C, 3)})

grouped2 = GROUP one BY ($0, $1);
((a, A), {(a, A, 1)})
((a, AA), {(a, AA, 11)})
((a, AAA), {(a, AAA, 111)})
((b, B), {(b, B, 2)})
((b, BB), {(b, BB, 22)})
((c, C), {(c, C, 3)})

summed = FOREACH grouped GENERATE group, SUM(one.$2);
(a, 123.0)
(b, 24.0)
(c, 3.0)

counted = FOREACH grouped GENERATE group, COUNT(one);
(a, 3)
(b, 2)
(c, 1)

flat = FOREACH grouped GENERATE FLATTEN(one);
(a, A, 1)
(a, AA, 11)
(a, AAA, 111)
(b, B, 2)
(b, BB, 22)
(c, C, 3)

cogrouped = COGROUP one BY $0, two BY $2;
(a, {(a, A, 1), (a, AA, 11), (a, AAA, 111)}, {(x, X, a)})
(b, {(b, B, 2), (b, BB, 22)}, {(y, Y, b), (x, XX, b)})
(c, {(c, C, 3)}, {(z, Z, c)})

flatc = FOREACH cogrouped GENERATE FLATTEN(one.($0,$2)), FLATTEN(two.$1);
(a, 1, X)
(a, 11, X)
(a, 111, X)
(b, 2, Y)
(b, 22, Y)
(b, 2, XX)
(b, 22, XX)
(c, 3, Z)

joined = JOIN one BY $0, two BY $2;
(a, A, 1, x, X, a)
(a, AA, 11, x, X, a)
(a, AAA, 111, x, X, a)
(b, B, 2, y, Y, b)
(b, BB, 22, y, Y, b)
(b, B, 2, x, XX, b)
(b, BB, 22, x, XX, b)
(c, C, 3, z, Z, c)

crossed = CROSS one, two;
(a, AA, 11, z, Z, c)
(a, AA, 11, x, XX, b)
(a, AA, 11, y, Y, b)
(a, AA, 11, x, X, a)
(c, C, 3, z, Z, c)
(c, C, 3, x, XX, b)
(c, C, 3, y, Y, b)
(c, C, 3, x, X, a)
(b, BB, 22, z, Z, c)
(b, BB, 22, x, XX, b)
(b, BB, 22, y, Y, b)
(b, BB, 22, x, X, a)
(a, AAA, 111, x, XX, b)
(b, B, 2, x, XX, b)
(a, AAA, 111, z, Z, c)
(b, B, 2, z, Z, c)
(a, AAA, 111, y, Y, b)
(b, B, 2, y, Y, b)
(b, B, 2, x, X, a)
(a, AAA, 111, x, X, a)
(a, A, 1, z, Z, c)
(a, A, 1, x, XX, b)
(a, A, 1, y, Y, b)
(a, A, 1, x, X, a)

SPLIT one INTO one_under IF $2 < 10, one_over IF $2 >= 10;
-- one_under:
(a, A, 1)
(b, B, 2)
(c, C, 3)

</code></pre>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/06/19/pig-hadoop-commands-and-sample-results/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
		</item>
		<item>
		<title>Hadoop Is the Linux of Data Processing</title>
		<link>http://parand.com/say/index.php/2008/04/09/hadoop-is-the-linux-of-data-processing/</link>
		<comments>http://parand.com/say/index.php/2008/04/09/hadoop-is-the-linux-of-data-processing/#comments</comments>
		<pubDate>Thu, 10 Apr 2008 05:33:48 +0000</pubDate>
		<dc:creator>Parand</dc:creator>
				<category><![CDATA[Hadoop]]></category>

		<guid isPermaLink="false">http://parand.com/say/index.php/2008/04/09/hadoop-is-the-linux-of-data-processing/</guid>
		<description><![CDATA[Mentioned in passing by Roberto today. Sounds about right to me.
]]></description>
			<content:encoded><![CDATA[<p>Mentioned in passing by Roberto today. Sounds about right to me.</p>
]]></content:encoded>
			<wfw:commentRss>http://parand.com/say/index.php/2008/04/09/hadoop-is-the-linux-of-data-processing/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

