in IT data-science ~ read.

Decoding an image of a graph into a table of values

I wanted to extract data from a simple image. My aim was to get the curve drawn on the chart. Then I needed to scale it so that it corresponds to temperatures. For that I needed to read the numbers from the image, which I did with cuneiform (an OCR tool).

Here is what I ended up doing.

import numpy as np  
import pandas as pd

class Aladin():
    """Recover the curve drawn on a chart image and scale it to temperatures.

    The image is downloaded from a URL, two regions are cropped out of it
    (the column of temperature labels and the chart area itself), the labels
    are read with the external ``cuneiform`` OCR program, and the pixels of
    the curve (matched by exact colour) are turned into a time-indexed
    ``pandas.Series`` stored in ``self.data``.
    """

    def __init__(self, urlOfImg, color=(255, 0, 0, 255),
                 temp_box=(45, 204, 75, 356),
                 chart_box=(80, 215, 747, 348)):
        """Download the image and prepare the two crops.

        Parameters
        ----------
        urlOfImg : str
            URL of the chart image. The file name (without extension) is
            passed to :meth:`deconstruct`, e.g. ``.../2015062800.png``.
        color : sequence of int
            RGBA colour of the curve; pixels must match it exactly.
        temp_box : tuple of int
            ``(left, upper, right, lower)`` crop box of the temperature
            labels. Defaults to the values used for the original picture.
        chart_box : tuple of int
            ``(left, upper, right, lower)`` crop box of the chart area.
            Defaults to the values used for the original picture.
        """
        # Imports are kept local so the module can be imported without
        # Pillow / network access until an instance is actually created.
        from urllib import request
        from PIL import Image

        self.base_img_path = request.urlretrieve(urlOfImg)[0]

        # convert() returns an independent in-memory image, so the file
        # handle opened by Image.open can be released immediately.
        with Image.open(self.base_img_path) as raw_img:
            self.base_img = raw_img.convert('RGBA')

        date = urlOfImg.split("/")[-1].split(".")[0]
        self.start, self.end = self.deconstruct(date)

        # The label crop is written to disk because the OCR program
        # works on files.
        temp_img = self.base_img.crop(temp_box)
        self.temps_path = "/tmp/temps.png"
        temp_img.save(self.temps_path)

        self.chart_img = self.base_img.crop(chart_box)

        # getpixel() returns a tuple; normalise so a list argument still
        # compares equal.
        self.color = tuple(color)

    def deconstruct(self, pr):
        """Return the ``(start, end)`` timestamps encoded in ``pr``.

        ``pr`` is a string whose last two characters are an hour and whose
        leading part is a date (e.g. ``"2015062800"``). The start is that
        date and hour plus two hours, and the end is 54 hours after the
        start.

        NOTE(review): the +2 h offset and the 54 h span are taken as-is from
        the original code; presumably they describe the chart's time axis.
        """
        day = pd.Timestamp(pr[:-2])
        # Use timedelta arithmetic instead of building an "HH:00:00" string:
        # source hours 22 and 23 would otherwise yield the invalid hours
        # 24 and 25.
        start = day + pd.Timedelta(hours=int(pr[-2:]) + 2)
        end = start + pd.Timedelta(54, "h")
        return start, end

    def fit_to_temps(self, ser):
        """Scale a 0..1 series to the temperature range and date its index.

        ``ser`` is re-indexed in place with ``len(ser)`` timestamps evenly
        spaced between ``self.start`` and ``self.end`` (rounded to whole
        seconds). The returned series is ``ser`` linearly mapped so that 0
        becomes ``min(self.temps)`` and 1 becomes ``max(self.temps)``.

        Raises
        ------
        ValueError
            If ``self.temps`` is empty (the OCR step found no numbers).
        """
        if not self.temps:
            raise ValueError("no temperature labels were recognised")
        min_t = min(self.temps)
        max_t = max(self.temps)

        # Evenly spaced instants in nanoseconds since the epoch, then
        # rounded to whole seconds so the index has no sub-second noise.
        nanos = np.linspace(self.start.value, self.end.value, len(ser))
        seconds = np.round(nanos / 1e9).astype(np.int64)
        ser.index = pd.to_datetime(seconds * 10**9)

        return (ser * (max_t - min_t)) + min_t

    def get_pixels_by_color(self, img):
        """Return the relative height of the curve for every image column.

        Every pixel of ``img`` is compared with ``self.color``. For each
        column ``x`` containing a match, the result holds
        ``(height - y) / height`` where ``y`` is the row of the match
        (the lowest matching row wins when a column has several). Columns
        without a match are absent from the result.
        """
        width, height = img.size
        # Collect into a dict first: enlarging a Series one label at a time
        # is slow and constructing a dtype-less empty Series is deprecated.
        heights = {}
        for x in range(width):
            for y in range(height):
                if img.getpixel((x, y)) == self.color:
                    heights[x] = -y
        found = pd.Series(heights, dtype="float64")
        return (found + height) / height

    def get_cunenumb(self, fn):
        """Run ``cuneiform`` OCR on the image file ``fn`` and return its text.

        The recognised text is written by ``cuneiform`` to ``/tmp/out.txt``
        and returned with trailing whitespace removed.

        Raises
        ------
        subprocess.CalledProcessError
            If ``cuneiform`` exits with a non-zero status.
        OSError
            If the output file cannot be read.
        """
        import subprocess

        out_path = '/tmp/out.txt'
        # check_output keeps the tool's stdout off the console; the text we
        # need is in the output file, not on stdout.
        subprocess.check_output(
            ['cuneiform', '--singlecolumn', '-o', out_path, fn])
        # Read the file directly instead of spawning `cat`.
        with open(out_path, 'rb') as fh:
            raw = fh.read()
        return raw.rstrip().decode("utf8")

    def get_temps(self):
        """OCR the temperature labels and store them as ints in ``self.temps``."""
        temps_str = self.get_cunenumb(self.temps_path)
        self.temps = [int(token) for token in temps_str.split()]

    def collection(self):
        """Run the whole pipeline and store the result in ``self.data``."""
        ser = self.get_pixels_by_color(self.chart_img)
        self.get_temps()
        self.data = self.fit_to_temps(ser)

    def plot(self):
        """Plot ``self.data`` (requires :meth:`collection` to have run)."""
        self.data.plot()

Now you can use it like this:

# Download the image and prepare the crops (performs a network request and
# writes the label crop to /tmp/temps.png).
par = Aladin("http://www.danielhnyk.cz/media/blog/decoding-an-image/2015062800.png")  
# Extract the curve, OCR the temperature labels and store the scaled,
# time-indexed series in par.data.
par.collection()  

You can then inspect the data with par.data or plot it with par.plot().

This was the picture I tried to decode (its first part).