More on GridFlow for Pure Data

Further to the previous post on GridFlow in Pure Data, I include here the installation steps for GridFlow. The Pd Extended package does not include GridFlow. After I download the file from the GridFlow website, I unzip it into the gridflow-9.13 folder. To install, I copy the inside gridflow folder to the following location:

/Library/Pd/

Within Pure Data preference, I add the path and startup option for GridFlow as shown in the following diagrams.

GridFlow path
Startup Option

To continue from the previous post with the integration of GEM and GridFlow, I redo one of my Processing work of slit-scan image in Pure Data. The GridFlow package comes with a similar example.

The source patch of the slit-scan example is:

Slit-scan example

OpenCL Particles System Example in Processing

The second example is adopted from the Million Particles example from the MSAOpenCL library. I developed it with Processing 2.0 alpha version without using the library and the Pointer class. All the memory buffers are standard Java.nio.Buffer. The performance is very acceptable with one million particles in the ATI Radeon HD 4670 graphics card.
 

 
The Processing source

import processing.opengl.*;
import javax.media.opengl.*;
import javax.media.opengl.glu.GLU;
import java.nio.FloatBuffer;
import java.nio.ByteBuffer;
 
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.opencl.CLMem.Usage;
 
final int PARTICLES_COUNT = 1000000;
float halfWidth, halfHeight;
 
GL2 gl;
PGL pgl;
 
int [] vbo = new int[1];
 
CLContext context;
CLQueue queue;
CLKernel kernel;
 
CLBuffer<Float> partMem;
FloatBuffer partBuf;
 
CLBuffer<Byte> posMem;
ByteBuffer partPos;
 
PVector mousePos;
 
void setup() {
  size(screenWidth, screenHeight, OPENGL);
  background(0);
  randomSeed(millis());
  halfWidth = width/2;
  halfHeight = height/2;
  mousePos = new PVector(float(mouseX) - halfWidth, halfHeight - float(mouseY));
 
  PGraphicsOpenGL pg = (PGraphicsOpenGL) g;
  pgl = pg.beginPGL();
  gl = pgl.gl.getGL().getGL2();
  gl.glClearColor(0, 0, 0, 1);
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glEnable(GL2.GL_POINT_SMOOTH);
  gl.glPointSize(2f);
 
  initOpenCL();
  pg.endPGL();
}
 
void initOpenCL() {
  context = JavaCL.createContextFromCurrentGL();
  queue = context.createDefaultQueue();
 
  partBuf = FloatBuffer.allocate(PARTICLES_COUNT * 4);
  partPos = ByteBuffer.allocateDirect(PARTICLES_COUNT * 2 * Float.SIZE/8).order(context.getByteOrder());
  FloatBuffer tmpPos = partPos.asFloatBuffer();
 
  for (int i = 0; i < PARTICLES_COUNT; i++) {
 
    partBuf.put(0.0f);
    partBuf.put(0.0f);
    partBuf.put(random(0.2, 2.0));
    partBuf.put(0.0f);
 
    tmpPos.put(random(width));
    tmpPos.put(random(height));
  }
 
  partBuf.rewind();
  partPos.rewind();
 
  gl.glGenBuffers(1, vbo, 0);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, vbo[0]);
 
  gl.glBufferData(GL.GL_ARRAY_BUFFER, (int) partPos.capacity(), partPos, GL2.GL_DYNAMIC_COPY);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, 0);
 
  posMem = context.createBufferFromGLBuffer(Usage.InputOutput, vbo[0]);
  partMem = context.createFloatBuffer(Usage.InputOutput, partBuf, true);
 
  String pgmSrc = join(loadStrings(dataPath("Particle.cl")), "\n");
  CLProgram program = context.createProgram(pgmSrc);
  kernel = program.build().createKernel("updateParticle");
  kernel.setArg(0, partMem);
  kernel.setArg(1, posMem);
  kernel.setArg(2, new float [] {
    mousePos.x, mousePos.y
  }
  );
}
 
void draw() {
  background(0);
  gl.glMatrixMode(GL2.GL_PROJECTION);
  gl.glLoadIdentity();
  pgl.glu.gluOrtho2D(-halfWidth - 1, halfWidth + 1, -halfHeight - 1, halfHeight + 1);
  gl.glMatrixMode(GL2.GL_MODELVIEW);
  gl.glColor3f(1.0f, 0.8f, 0.0f);
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, vbo[0]);
  queue.finish();
  gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
  gl.glVertexPointer(2, GL.GL_FLOAT, 0, 0);
  gl.glDrawArrays(GL2.GL_POINTS, 0, PARTICLES_COUNT);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
  gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
  callKernel();
}
 
void callKernel() {
  mousePos.set(float(mouseX) - halfWidth, halfHeight - float(mouseY), 0.0f);
  CLEvent kernelCompletion;
  synchronized(kernel) {
    posMem.acquireGLObject(queue);
    kernel.setArg(2, new float [] {
      mousePos.x, mousePos.y
    }
    );
    int [] globalSizes = new int[] {
      PARTICLES_COUNT
    };
    kernelCompletion = kernel.enqueueNDRange(queue, globalSizes);
    posMem.releaseGLObject(queue);
  }
}

 
The kernel source

#define DAMP			0.95f
#define CENTER_FORCE		0.005f
#define MOUSE_FORCE		200.0f
#define MIN_SPEED		0.2f
 
typedef struct{
	float2 vel;
	float mass;
	float dummy;		
} Particle;
 
__kernel void updateParticle(__global Particle* particles, 
	__global float2* posBuffer, 
	const float2 mousePos)
{
	int id = get_global_id(0);
	__global Particle *p = &particles[id];
 
	float2 diff = mousePos - posBuffer[id];
	float invDistSQ = 1.0f / dot(diff, diff);
	diff *= (MOUSE_FORCE * invDistSQ);
 
	p->vel += -posBuffer[id] * CENTER_FORCE - diff * p->mass;
 
	float speed2 = dot(p->vel, p->vel);
	if (speed2 < MIN_SPEED) 
		posBuffer[id] = mousePos + diff * (1 + p->mass);
 
	posBuffer[id] += p->vel;
	p->vel *= DAMP;
}

OpenCL Particles System with Processing

I ported the particles demo program in JavaCL to Processing 2.0 alpha. It has reasonable performance in my iMac up to 500,000 particles at twenty something frames per second. The video is captured using the QuickTime screen recording. The performance is much slower than the original screen version.
 

import processing.opengl.*;
import javax.media.opengl.*;
import javax.media.opengl.glu.GLU;
import java.util.Random;
 
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.opencl.CLMem.Usage;
import org.bridj.Pointer;
 
import static org.bridj.Pointer.*;
 
final int particlesCount = 200000;
 
GL2 gl;
PGL pgl;
 
int [] vbo = new int[1];
 
CLContext context;
CLQueue queue;
 
Pointer<Float> velocities;
CLKernel updateParticleKernel;
 
CLBuffer<Float> massesMem, velocitiesMem;
CLBuffer<Byte> interleavedColorAndPositionsMem;
Pointer<Byte> interleavedColorAndPositionsTemp;
 
int elementSize = 4*4;
 
void setup() {
  size(800, 600, OPENGL);
  background(0);
  randomSeed(millis());
 
  PGraphicsOpenGL pg = (PGraphicsOpenGL) g;
  pgl = pg.beginPGL();
  gl = pgl.gl.getGL().getGL2();
  gl.glClearColor(0, 0, 0, 1);
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glEnable(GL.GL_BLEND);
  gl.glEnable(GL2.GL_POINT_SMOOTH);
  gl.glPointSize(1f);
  initOpenCL();
  pg.endPGL();
}
 
void initOpenCL() {
  context = JavaCL.createContextFromCurrentGL();
  queue = context.createDefaultQueue();
 
  Pointer<Float> masses = allocateFloats(particlesCount).order(context.getByteOrder());
  velocities = allocateFloats(2 * particlesCount).order(context.getByteOrder());
  interleavedColorAndPositionsTemp = allocateBytes(elementSize * particlesCount).order(context.getByteOrder());
 
  Pointer<Float> positionsView = interleavedColorAndPositionsTemp.as(Float.class);
  for (int i = 0; i < particlesCount; i++) {
    masses.set(i, 0.5f + 0.5f * random(1));
    velocities.set(i * 2, random(-0.5, 0.5) * 0.2f);
    velocities.set(i * 2 + 1, random(-0.5, 0.5) * 0.2f);
    int colorOffset = i * elementSize;
    int posOffset = i * (elementSize / 4) + 1;
    byte r = (byte) 220, g = r, b = r, a = r;
    interleavedColorAndPositionsTemp.set(colorOffset++, r);
    interleavedColorAndPositionsTemp.set(colorOffset++, g);
    interleavedColorAndPositionsTemp.set(colorOffset++, b);
    interleavedColorAndPositionsTemp.set(colorOffset, a);
    float x = random(-0.5, 0.5) * width/2.0, 
    y = random(-0.5, 0.5) * height/2.0;
    positionsView.set(posOffset, (float) x);
    positionsView.set(posOffset + 1, (float) y);
  }
  velocitiesMem = context.createBuffer(Usage.InputOutput, velocities, false);
  massesMem = context.createBuffer(Usage.Input, masses, true);
 
  gl.glGenBuffers(1, vbo, 0);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, vbo[0]);
  gl.glBufferData(GL.GL_ARRAY_BUFFER, (int) interleavedColorAndPositionsTemp.getValidBytes(), interleavedColorAndPositionsTemp.getByteBuffer(), GL2.GL_DYNAMIC_COPY);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, 0);
 
  interleavedColorAndPositionsMem = context.createBufferFromGLBuffer(Usage.InputOutput, vbo[0]);
  String pgmSrc = join(loadStrings(dataPath("ParticlesDemoProgram.cl")), "\n");
  CLProgram program = context.createProgram(pgmSrc);
  updateParticleKernel = program.build().createKernel("updateParticle");
  callKernel();
}
 
void draw() {
  queue.finish();
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_SRC_COLOR);
  gl.glMatrixMode(GL2.GL_PROJECTION);
  gl.glLoadIdentity();
  pgl.glu.gluOrtho2D(-width/2 - 1, width/2 + 1, -height/2 - 1, height/2 + 1);
  gl.glMatrixMode(GL2.GL_MODELVIEW);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, vbo[0]);
  gl.glInterleavedArrays(GL2.GL_C4UB_V2F, elementSize, 0);
  gl.glDrawArrays(GL.GL_POINTS, 0, particlesCount);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
  callKernel();
}
 
void callKernel() {
  CLEvent kernelCompletion;
  synchronized(updateParticleKernel) {
    interleavedColorAndPositionsMem.acquireGLObject(queue);
    updateParticleKernel.setArgs(massesMem, 
    velocitiesMem, 
    interleavedColorAndPositionsMem.as(Float.class), 
    new float[] {
      mouseX-width/2, height/2-mouseY
    }
    , 
    new float[] {
      width, height
    }
    , 
    2.0, 
    2.0, 
    0.9, 
    0.8, 
    (byte) 0);
 
    int [] globalSizes = new int[] {
      particlesCount
    };
    kernelCompletion = updateParticleKernel.enqueueNDRange(queue, globalSizes);
    interleavedColorAndPositionsMem.releaseGLObject(queue);
  }
}

 
Here is the OpenCL source.

// Ported to JavaCL/OpenCL4Java (+ added colors) by Olivier Chafik
 
#define REPULSION_FORCE 4.0f
#define CENTER_FORCE2 0.0005f
 
#define PI 3.1416f
 
//#pragma OpenCL cl_khr_byte_addressable_store : enable
 
uchar4 HSVAtoRGBA(float4 hsva)
{
    float h = hsva.x, s = hsva.y, v = hsva.z, a = hsva.w;
    float r, g, b;
 
        int i;
        float f, p, q, t;
        if (s == 0) {
                // achromatic (grey)
                r = g = b = v;
                return (uchar4)(r * 255, g * 255, b * 255, a * 255);
        }
        h /= 60;                        // sector 0 to 5
        i = floor( h );
        f = h - i;                      // factorial part of h
        p = v * ( 1 - s );
        q = v * ( 1 - s * f );
        t = v * ( 1 - s * ( 1 - f ) );
        switch( i ) {
                case 0:
                        r = v;
                        g = t;
                        b = p;
                        break;
                case 1:
                        r = q;
                        g = v;
                        b = p;
                        break;
                case 2:
                        r = p;
                        g = v;
                        b = t;
                        break;
                case 3:
                        r = p;
                        g = q;
                        b = v;
                        break;
                case 4:
                        r = t;
                        g = p;
                        b = v;
                        break;
                default:                // case 5:
                        r = v;
                        g = p;
                        b = q;
                        break;
        }
    return (uchar4)(r * 255, g * 255, b * 255, a * 255);
}
 
__kernel void updateParticle(
        __global float* masses,
        __global float2* velocities,
        //__global Particle* particles,
        __global float4* particles,
        //__global char* pParticles,
        const float2 mousePos,
        const float2 dimensions,
        float massFactor,
        float speedFactor,
        float slowDownFactor,
        float mouseWeight,
        char limitToScreen
) {
	int id = get_global_id(0);
 
        float4 particle = particles[id];
 
        uchar4 color = as_uchar4(particle.x);
 
        float2 position = particle.yz;
    	float2 diff = mousePos - position;
 
        float invDistSQ = 1.0f / dot(diff, diff);
	float2 halfD = dimensions / 2.0f;
        diff *= (halfD).y * invDistSQ;
 
        float mass = massFactor * masses[id];
        float2 velocity = velocities[id];
        velocity -= mass * position * CENTER_FORCE2 - diff * mass * mouseWeight;
        position += speedFactor * velocities[id];
 
        if (limitToScreen) {
            float2 halfDims = dimensions / 2.0f;
            position = clamp(position, -halfDims, halfDims);
        }
 
        float dirDot = cross((float4)(diff, (float2)0), (float4)(velocity, (float2)0)).z;
        float speed = length(velocity);
 
        float f = speed / 4 / mass;
        float hue = (dirDot < 0 ? f : f + 1) / 2;
        hue = clamp(hue, 0.0f, 1.0f) * 360;
 
        float opacity = clamp(0.1f + f, 0.0f, 1.0f);
        float saturation = mass / 2;
        float brightness = 0.6f + opacity * 0.3f;
 
        uchar4 targetColor = HSVAtoRGBA((float4)(hue, saturation, brightness, opacity));
 
        float colorSpeedFactor = min(0.01f * speedFactor, 1.0f), otherColorSpeedFactor = 1 - colorSpeedFactor;
        color = (uchar4)(
            (uchar)(targetColor.x * colorSpeedFactor + color.x * otherColorSpeedFactor),
            (uchar)(targetColor.y * colorSpeedFactor + color.y * otherColorSpeedFactor),
            (uchar)(targetColor.z * colorSpeedFactor + color.z * otherColorSpeedFactor),
            (uchar)(targetColor.w * colorSpeedFactor + color.w * otherColorSpeedFactor)
        );
 
        particle.x = as_float(color);
        particle.yz = position;
 
    	particles[id] = particle;
 
        velocity *= slowDownFactor;
        velocities[id] = velocity;
}

OpenGL and Processing 2.0

The existing OpenGL codes for Processing do not work in the 2.0 alpha. Here is an example code segment I modify to use the new PGL class.

import processing.opengl.*;
import javax.media.opengl.*;
 
GL2 gl;
float t, s, c;
 
void setup() {
  size(400, 400, OPENGL);
  background(0);
  PGraphicsOpenGL pg = (PGraphicsOpenGL) g;
  PGL pgl = pg.beginPGL();
  gl = pgl.gl.getGL().getGL2();
  pg.endPGL();
  t = 0.0f;
  s = 0.0f;
  c = 0.0f;
}
 
void draw() {
  t += 0.01;
  s = sin(t);
  c = cos(t);
 
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glBegin(GL.GL_TRIANGLES);
  gl.glColor3f(1, 0, 0);
  gl.glVertex3f(-c, -c, s);
  gl.glColor3f(0, 1, 0);
  gl.glVertex3f(0, c, 0);
  gl.glColor3f(0, 0, 1);
  gl.glVertex3f(s, -s, c);
  gl.glEnd();
}

OpenCL and Processing

I have done a number of testings with various Java implementations of OpenCL and Processing. The major Java bindings of OpenCL include

The are two major OpenCL libraries for Processing at the time I do the testing, MSAOpenCL using the JavaCL and openclp5 using the JOCL. I do not use the Processing libraries and call directly the Java binding codes. Each of the implementation contains a Hello World example kernel that performs calculation across a large array. In my test, I modify the kernel program to do a multiplication between two floating point numbers with an array size of a million cells.

The sample kernel program code is

__kernel void sampleKernel(__global const float *a,
        __global const float *b,
        __global float *c,
        int n)
{
	int gid = get_global_id(0);
	if (gid &gt;= n)
		return;
	c[gid] = a[gid] * b[gid];
}

I use a MacBook Pro for the testing. The operating system is OSX Lion. It includes the OpenCL implementation in the default OS. The graphics card is Nvidia GeForce 9400M with 256M graphic memory. The Processing version is alpha build 2.0a5 running in 64-bit mode. Various JAR files have to be copied to the code folder. Here is the summary.

JOCL

  • JOCL-0.1.7.jar

Java OpenCL (jogamp)

  • jocl.jar
  • gluegen-rt.jar
  • gluegen-rt-natives-macosx-universal.jar
  • libjocl.dylib

JavaCL

  • javacl-1.0.0-RC2-shaded.jar

I did ten consecutive runs of each implementation. The final figure is an average of the ten runs. Each measurement is taken just before and after the kernel execution. The first implementation (JOCL) has more fluctuated results. The second implementation has only a single command to invoke the kernel execution and passing back the result. Therefore, it is not easy to single out the execution time of the GPU. The third implementation (JavaCL) is obviously the fastest one and more stable in terms of parallel execution.

Performance with one million cells

Implementation ms
JOCL 3.207
Java OpenCL 17.0008
JavaCL 2.9865
CPU 10.952

Performance with half a million cells

Implementation ms
JOCL 3.9424
Java OpenCL 8.5672
JavaCL 2.9354
CPU 9.2368

The source files of the testings can be downloaded here:

DirectShow for Processing – OpenGL

I try to work out another version of the DirectShow for Processing classes in the last post. In this version, I write the movie data directly to an OpenGL texture object. Below is the modified version of the DMovie class. The DCapture class can also be modified in the same way.
 
The modified DMovie class

import de.humatic.dsj.*;
import java.awt.image.BufferedImage;
import com.sun.opengl.util.texture.*;
 
class DMovie implements java.beans.PropertyChangeListener {
 
  private DSMovie movie;
  public int width, height;
  public Texture tex;
 
  DMovie(String _s) {
    movie = new DSMovie(dataPath(_s), DSFiltergraph.DD7, this);
    movie.setVolume(1.0);
    movie.setLoop(false);
    movie.play();
    width = movie.getDisplaySize().width;
    height = movie.getDisplaySize().height;
    tex = TextureIO.newTexture(movie.getImage(), false);
  }
 
  public void updateImage() {
    BufferedImage bimg = movie.getImage();
    TextureData td = TextureIO.newTextureData(bimg, false);
    tex.updateImage(td);
  }
 
  public void loop() {
    movie.setLoop(true);
    movie.play();
  }
 
  public void play() {
    movie.play();
  }
 
  public void propertyChange(java.beans.PropertyChangeEvent e) {
    switch (DSJUtils.getEventType(e)) {
    }
  }
}

 
Sample code that uses the new DMovie class

import processing.opengl.*;
import javax.media.opengl.*;
 
DMovie mov;
PGraphicsOpenGL pgl;
 
void setup()
{
  size(1280, 692, OPENGL);
  pgl = (PGraphicsOpenGL) g;
  GL gl = pgl.beginGL();
  background(0);
  mov = new DMovie("Hugo.mp4");
  mov.loop();
  mov.tex.bind();
  pgl.endGL();
  ;
}
 
void draw()
{
  GL gl = pgl.beginGL();
  mov.updateImage();
  mov.tex.enable();
  gl.glBegin(GL.GL_QUADS);
  gl.glTexCoord2f(0, 0); 
  gl.glVertex2f(0, 0);
  gl.glTexCoord2f(1, 0); 
  gl.glVertex2f(width, 0);
  gl.glTexCoord2f(1, 1); 
  gl.glVertex2f(width, height);
  gl.glTexCoord2f(0, 1); 
  gl.glVertex2f(0, height);
  gl.glEnd();  
 
  mov.tex.disable();
  pgl.endGL();
}