
from __future__ import with_statement # This isn't required in Python 2.6
import struct
import sys, logging
import os
from urllib2 import urlopen, HTTPError
import socket
import subprocess as sp

__author__ = "Gary Dusbabek gdusbabek@gmail.com"
__version__ = "0.1"
__copyright__ = "None. 2009 Gary Dusbabek"
__license__ = "None. I release this code into the public domain. Have at it."

"""
This is a simple generic ID3 parser that attempts to extract text tags from
mp3 files.  All strings returned are unicode, so no fretting there.  I also
took pains to create a way to capture the mp3 image, if there is one. To disable,
set 'pic_saver' to None.  There is a slight dependency on the 'convert' utility
supplied by Imagemagick to save APIC tags.

Usage:

id3 = PathId3(<path to a file>)
-or-
id3 = StreamId3(<stream of some sort--something that can be read>, <path or url of that stream>)
getRemoteTags(url) wraps this approach.

Then everything you want is in the id3.tags dictionary.  I'm discarding everything
but artist, album, title, year, track and the picture.  However, it would be simple
to throw every text frame into the dict.
"""

# Uncomment to see the parser's debug output.
#logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# Two-byte version fields compared against the bytes that follow the "ID3"
# marker, one per supported id3 revision.
V2_VERSION = struct.pack("2B", 2, 0)
V3_VERSION = struct.pack("2B", 3, 0)
V4_VERSION = struct.pack("2B", 4, 0)

# A single zero byte and a pair of zero bytes.
NULL = struct.pack("B", 0)
NULLNULL = NULL + NULL

# The two byte sequences that mark text as unicode.
BOM1 = struct.pack("2B", 0xFF, 0xFE)
BOM2 = struct.pack("2B", 0xFE, 0xFF)

# urllib2 would not give up on slow websites by itself, so a process-wide
# socket timeout (in seconds) is installed at import time.
timeout = 10
socket.setdefaulttimeout(timeout)

def picSaver(d, directory="/home/garyd/dusbabek_svn/mp3_blog_scraper/pic"):
    """ saves picture data to a directory and returns the path written.

    d is a dict with the keys "mime", "data", "artist", "album" and "title".
    The file name is the hash of artist+album+title and the extension is the
    part of the mime type after the slash (or the whole value if there is no
    slash).  directory defaults to the location this script has always used.

    After writing, ImageMagick's 'convert' is run on the file.  If 'convert'
    is not installed the picture is kept as written and a warning is logged.
    """
    mime = d["mime"]
    data = d["data"]
    artist = d["artist"]
    album = d["album"]
    title = d["title"]
    # "image/jpeg" -> "jpeg"; a bare "JPG" is used as the extension as-is.
    parts = mime.split("/")
    if len(parts) > 1:
        ext = parts[1]
    else:
        ext = parts[0]
    fname = os.path.join(directory, str(hash(artist + album + title)) + "." + ext)
    # 'with' guarantees the handle is closed even if the write fails.
    with open(fname, "wb") as f:
        f.write(data)
    # now resample it to be 75px.
    # NOTE(review): '-size' looks like a read-time hint in ImageMagick;
    # '-resize' may be what was intended -- confirm before changing.
    try:
        sp.call(["convert", "-size", "75", fname, fname])
    except OSError:
        logging.warning("could not run 'convert'; leaving %s as written", fname)
    return fname

# Optional picture hook.  When set to a callable, BasicId3.process() calls it
# once per parsed tag that contained an APIC picture, passing a dict with the
# keys "mime", "data", "artist", "album" and "title".  None disables saving.
#pic_saver = picSaver
pic_saver = None

class StreamId3:
    """Reads id3 information from a stream like object (opened file, opened url, etc.)

    The stream is consumed from its current position.  After construction
    self.tags always exists: it holds the parsed values, or the empty default
    set when there is no tag or the id3 version is not one this module reads.
    path is only used for log messages and is handed on to the parser.
    """
    def __init__(self, stream, path):
        # BasicId3 only produces the empty default tags, so it doubles as the
        # fallback for "no tag" and "unknown version".
        parser = BasicId3
        fileIdentifier = stream.read(3)
        # check identifier to see if tag is present.
        if fileIdentifier != "ID3":
            logging.debug("No tags")
        else:
            # next two bytes contain the ID3v2.x version $03,$00 or $04,$00
            version = stream.read(2)
            if version == V2_VERSION:
                logging.debug("Id3 2.2")
                parser = Id3v22
            elif version == V3_VERSION:
                logging.debug("Id3 2.3")
                parser = Id3v23
            elif version == V4_VERSION:
                logging.debug("Id3 2.4")
                parser = Id3v24
            else:
                # Previously self.tags was left unset on this path, so any
                # caller touching .tags hit an AttributeError.
                logging.debug("Not reading %s", path)
        self.tags = parser(stream, path).tags

class PathId3(StreamId3):
    """ Used to create an id3 from a filesystem path.

    The file is opened in binary mode, parsed in full during construction and
    closed again before the constructor returns.
    """
    def __init__(self, path):
        # StreamId3.__init__ does all of its reading eagerly, so the handle
        # can be released as soon as it returns (or raises).
        with open(path, 'rb') as stream:
            StreamId3.__init__(self, stream, path)

## 2.2
def simple(data):
    """ identity handler: hands the frame payload back untouched. """
    return data

## 2.3, 2.4
def simple_with_enc(data):
    """ return a text frame as unicode.

    data is the raw frame payload: one encoding byte followed by the text.
    The encoding byte is interpreted as in the id3v2 specs:
      0 = ISO-8859-1, 1 = UTF-16 with BOM, 2 = UTF-16BE, 3 = UTF-8.
    A leading byte order mark always wins over the encoding byte, because
    some files carry a BOM but the wrong encoding byte.  For encoding 0 (and
    unknown values) UTF-8 is tried before ISO-8859-1, since many taggers
    write UTF-8 there.  Undecodable bytes become U+FFFD rather than raising.
    An empty payload gives an empty unicode string.
    """
    if not data:
        return u""
    enc = struct.unpack("B", data[0:1])[0]
    payload = data[1:]
    has_bom = (len(payload) >= 2 and
               struct.unpack("BB", payload[0:2]) in ((0xFF, 0xFE), (0xFE, 0xFF)))
    if has_bom or enc == 1:
        # The BOM is left in place on purpose: the utf16 codec consumes it
        # and uses it to choose little- or big-endian decoding.
        return payload.decode("utf16", "replace")
    if enc == 2:
        return payload.decode("utf_16_be", "replace")
    if enc == 3:
        return payload.decode("utf8", "replace")
    # make every attempt to return good data.
    try:
        return payload.decode("utf8")
    except UnicodeDecodeError:
        # iso8859-1 maps every byte value, so this cannot fail.
        return payload.decode("iso8859-1")

## generic
def noimpl(data):
    """ stand-in for frames nobody handles: ignores data, gives back "". """
    return ""

def pic(data):
    """ get the picture data out of a 2.2 frame.

    Layout read here: encoding byte, three byte image format, picture type
    byte, terminated description, picture bytes.  With encoding 1 the
    description is two-byte text and ends with two zero bytes on a two-byte
    boundary; otherwise it ends with a single zero byte.

    Returns {"mime": <3 byte image format>, "data": <picture bytes>}, or
    None if the frame is too short or the description is never terminated.
    """
    if not data:
        return None
    sz = len(data)
    nul = struct.pack("B", 0)
    enc = struct.unpack("B", data[0:1])[0]
    logging.debug("ENC %s", enc)
    mime = data[1:4]
    # byte 4 is the picture type, which is not used; the description starts
    # right after it.
    pos = 5
    if enc == 1:
        width = 2
        codec = "utf16"
    else:
        width = 1
        codec = "iso8859-1"
    terminator = nul * width
    end = pos
    while end + width <= sz and data[end:end + width] != terminator:
        end += width
    if end + width > sz:
        # ran off the end without finding the terminator.
        logging.debug("PIC frame without a terminated description")
        return None
    # the description is only of interest for debugging.
    logging.debug("DESC %s", data[pos:end].decode(codec, "replace"))
    return {"mime": mime, "data": data[end + width:]}

def apic(data):
    """get the picture data out of 2.3 and 2.4 frames.

    Layout read here: encoding byte, zero-terminated mime type, picture type
    byte, terminated description, picture bytes.  The encoding byte applies
    to the description: for the UTF-16 encodings (1 and 2) it ends with two
    zero bytes on a two-byte boundary, otherwise with a single zero byte.

    Returns {"mime": <mime type as read>, "data": <picture bytes>}, or None
    if the frame is truncated or a terminator is missing.
    """
    if not data:
        return None
    sz = len(data)
    nul = struct.pack("B", 0)
    enc = struct.unpack("B", data[0:1])[0]
    logging.debug("ENC %s", enc)
    mime_end = data.find(nul, 1)
    if mime_end < 0 or mime_end + 1 >= sz:
        # no mime terminator, or nothing after it.
        logging.debug("APIC frame without a complete header")
        return None
    mime = data[1:mime_end]
    logging.debug("MIME %s", mime)
    pos = mime_end + 1 # get past the null
    pic_type = struct.unpack("B", data[pos:pos + 1])[0]
    logging.debug("TYPE %s", pic_type)
    pos += 1 # get past the type
    if enc in (1, 2):
        width = 2
    else:
        width = 1
    terminator = nul * width
    end = pos
    while end + width <= sz and data[end:end + width] != terminator:
        end += width
    if end + width > sz:
        # ran off the end without finding the terminator.
        logging.debug("APIC frame without a terminated description")
        return None
    # the description is only of interest for debugging.
    codec = {1: "utf16", 2: "utf_16_be", 3: "utf8"}.get(enc, "iso8859-1")
    logging.debug("DESC %s", data[pos:end].decode(codec, "replace"))
    return {"mime": mime, "data": data[end + width:]}

# Frame ids that need a dedicated (non-text) handler, one table per frame id
# width: four character ids (used by Id3v23) and three character ids (Id3v22).
_HANDLERSv23 = {
    "APIC": apic,
}
_HANDLERSv22 = {
    "PIC": pic,
}

class BasicId3:
    """ parsing is more or less the same regardless of id3 version.  The only
    thing that really changes are some field lengths (customized in child classes).

    On its own this class only provides the empty default tags.  Child
    classes set FRAME_ID_LEN, FRAME_SZ_FMT, ZEROS, HANDLERS, TXT_HANDLER and
    frameMap and then call process() to fill self.tags.  """
    def __init__(self, file, path):
        # file is not read here; it is part of the signature so that every
        # parser class can be constructed the same way.
        self.path = path
        self.tags = {"album":"", "artist":"", "track":"", "title":"", "year":"", "picture":None}

    def process(self, file):
        """ This is where the work is done.

        Reads the rest of the tag header and then frame after frame from
        file, storing every frame listed in self.frameMap in self.tags.
        Parsing stops quietly at padding, at an mp3 sync signal or when the
        stream ends early; a frame whose payload cannot be decoded is skipped
        so that the remaining frames are still read.  If a picture was found
        and the module level pic_saver hook is set, it is called at the end.
        """
        # 5 bytes have already been read ("ID3" plus version); next come one
        # flag byte and four size bytes.
        header = file.read(5)
        if len(header) < 5:
            logging.debug("Truncated id3 header in %s", self.path)
            return
        header = struct.unpack("5B", header)
        # The flags are decoded but nothing below acts on them yet (an
        # extended header, for instance, is not skipped).
        isSynchronized, isExtended, isExperimental, footer = grokHeaderFlag(header[0])
        tagSize = computeTagLength(header[1:])
        picture = None
        read = 0
        while read < tagSize:
            frameId = file.read(self.FRAME_ID_LEN).strip()
            read += self.FRAME_ID_LEN
            # all zeros means padding; a short id means the stream ended.
            if frameId == self.ZEROS or len(frameId) < self.FRAME_ID_LEN:
                break
            # if we read a sync signal, we've gone too far.
            if containsMp3SyncSignal(frameId):
                break
            try:
                # FRAME_SZ_FMT has one "B" per size byte, so its length is
                # also the number of bytes to read.
                frameSz = makeInt(struct.unpack(self.FRAME_SZ_FMT, file.read(len(self.FRAME_SZ_FMT))), 8)
                read += len(self.FRAME_SZ_FMT)
                flags, flagsLen = self.getFlags(file)
                read += flagsLen
                if frameSz == 0:
                    continue
                frameData = file.read(frameSz)
                read += frameSz
            except OverflowError:
                logging.warning("couldn't make int.")
                break
            except struct.error:
                # fewer bytes than a frame header needs: the stream is cut off.
                logging.debug("Truncated frame header in %s", self.path)
                break
            try:
                if frameId in self.HANDLERS:
                    handler = self.HANDLERS[frameId]
                    info = handler(frameData)
                    if handler is apic:
                        if pic_saver:
                            picture = info
                        self.tags[self.frameMap[frameId]] = info
                elif frameId[0] == "T":
                    info = self.TXT_HANDLER(frameData)
                    logging.debug("%s is {%s}", frameId, info)
                else:
                    info = noimpl(frameData)
            except (IndexError, UnicodeError, struct.error):
                # one broken frame should not cost us the rest of the tag.
                logging.warning("Skipping malformed %s frame in %s", frameId, self.path)
                info = None
            if info and frameId in self.frameMap:
                self.tags[self.frameMap[frameId]] = info
            if tagSize - read < 10:
                # not enough space for another frame.
                read = tagSize
        if picture:
            pic_saver({
                "mime": picture["mime"],
                "data": picture["data"],
                "artist": self.tags.get("artist"),
                "album": self.tags.get("album"),
                "title": self.tags.get("title"),
            })

    def getFlags(self, file):
        """ returns (flags, number of bytes consumed).  The base layout has
        no per-frame flags, so nothing is read from file. """
        return "", 0

class CrapId3(BasicId3):
    """ A CrapId3 wraps a basic (empty) set of tags with an error message.

    errType and msg are stored as given.  There is no stream or path behind
    this object, so path is None.
    """
    def __init__(self, errType, msg):
        # Let the base class build the default tags instead of keeping a
        # second copy of that dictionary here.
        BasicId3.__init__(self, None, None)
        self.errType = errType
        self.msg = msg

class Id3v22(BasicId3):
    """ id3 2.2: three character frame ids and three byte frame sizes. """
    def __init__(self, file, path):
        BasicId3.__init__(self, file, path)
        # frame ids we keep, and the tag name each one is stored under.
        self.frameMap = {
            "TAL": "album",
            "TP1": "artist",
            "TRK": "track",
            "TT2": "title",
            "TYE": "year",
            "PIC": "picture",
        }
        self.HANDLERS = _HANDLERSv22
        self.TXT_HANDLER = simple_with_enc
        self.FRAME_ID_LEN = 3
        # one "B" per size byte; process() also uses the length of this
        # string as the number of bytes to read.
        self.FRAME_SZ_FMT = "BBB"
        self.ZEROS = struct.pack("3B", 0, 0, 0)
        # parsing happens right away, during construction.
        self.process(file)

class Id3v23(BasicId3):
    """ id3 2.3: four character frame ids, four byte frame sizes and two
    flag bytes per frame. """
    def __init__(self, file, path):
        BasicId3.__init__(self, file, path)
        # frame ids we keep, and the tag name each one is stored under.
        self.frameMap = {
            "TALB": "album",
            "TPE1": "artist",
            "TRCK": "track",
            "TIT2": "title",
            "TYER": "year",
            "APIC": "picture",
        }
        self.HANDLERS = _HANDLERSv23
        self.TXT_HANDLER = simple_with_enc
        self.FRAME_ID_LEN = 4
        # one "B" per size byte; process() also uses the length of this
        # string as the number of bytes to read.
        self.FRAME_SZ_FMT = "BBBB"
        self.ZEROS = struct.pack("4B", 0, 0, 0, 0)
        # parsing happens right away, during construction.
        self.process(file)

    def getFlags(self, file):
        """ reads the two flag bytes that follow the frame size and returns
        them as a tuple, together with the number of bytes consumed. """
        flagBytes = file.read(2)
        return struct.unpack("BB", flagBytes), 2

class Id3v24(Id3v23):
    """ id3 2.4 (more or less same as 2.3) """
    def __init__(self, file, path):
        # Reuse the 2.3 setup, which also runs process().  Calling only
        # BasicId3.__init__ here left every 2.4 file with empty tags, because
        # nothing was configured and nothing was parsed.
        # NOTE(review): the 2.4 spec stores frame sizes as synchsafe integers
        # (7 bits per byte) while process() combines 8 bits per byte, so
        # frames of 128 bytes or more may be mis-sized -- confirm and extend
        # process() if large 2.4 frames matter.
        Id3v23.__init__(self, file, path)

def containsMp3SyncSignal(string):
    """ True when string starts with an mp3 sync signal (not part of id3):
    a 0xff byte followed by a byte whose top three bits are all set. """
    first = struct.unpack("B", string[0:1])[0]
    if first & 0xff != 0xff:
        return False
    # the second byte is only looked at once the first one matched.
    second = struct.unpack("B", string[1:2])[0]
    return second & 0xe0 == 0xe0

def makeInt(bytes, shift):
    """ folds a sequence of byte values into one integer, most significant
    first.  shift is how many bits each byte contributes: 8 for plain
    integers, 7 for id3's 28 bit integers that are spread over 32 bits. """
    value = 0
    for part in bytes:
        value = (value << shift) | part
    return value

def grokHeaderFlag(byte):
    """ splits the header flag byte into four booleans, returned in the order
    (sync, extended, experimental, footer) for bits 0x80, 0x40, 0x20, 0x10. """
    masks = (0x80, 0x40, 0x20, 0x10)
    return tuple((byte & mask) > 0 for mask in masks)

def computeTagLength(bytes):
    """ turns the four size bytes of the tag header into the tag length.
    Each byte contributes seven bits (its top bit is always zero), which
    gives a 28 bit unsigned value. """
    bitsPerByte = 7
    return makeInt(bytes, bitsPerByte)

def getRemoteTags(path):
    """ Utility method meant to be called from the outside.

    Opens the url in path and parses its id3 tag.  Returns the StreamId3 on
    success.  On any error a CrapId3 carrying the exception type (as a
    string) and the exception value is returned instead of raising, so the
    result always has a .tags dictionary.
    """
    try:
        mp3 = urlopen(path)
        try:
            return StreamId3(mp3, path)
        finally:
            # parsing is finished (or failed); don't keep the connection open.
            mp3.close()
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and
        # sys.exit() are not swallowed.
        errType, err = sys.exc_info()[:2]
        logging.error("%s %s", errType, err)
        return CrapId3(str(errType), err)

def scan_dir(dir):
    """ handy for scanning a buttload of local mp3s.

    Builds a PathId3 for every file below dir.  The parsed objects are
    discarded, so this is useful for exercising the parser and its logging.
    """
    # os.walk already descends into every subdirectory.  Recursing on
    # dirnames as well made each subtree get scanned over and over.
    for dirpath, dirnames, filenames in os.walk(dir):
        for name in filenames:
            PathId3(os.path.join(dirpath, name))
    

def main(argv=None):
    """ scans the directories named on the command line.

    argv defaults to sys.argv; everything after the program name is taken as
    a directory to scan.  With no directories given, the problem_mp3s
    directory this script was written against is scanned, as before.
    """
    if argv is None:
        argv = sys.argv
    dirs = argv[1:] or ["/home/garyd/dusbabek_svn/mp3_blog_scraper/problem_mp3s"]
    for d in dirs:
        scan_dir(d)


if __name__ == "__main__":
    # run as a script: main()'s return value becomes the exit status.
    sys.exit(main())
