====== Python Examples ======

===== HowToForge To Wiki Converter =====

This example shows how the HTML parser built into Python works. This small program is used to feed the present **''Wiki''** with various documentation taken from [[http://www.howtoforge.com|HowToForge]]. The example uses only libraries bundled with Python, and the version used is //''2.7.x''//.

<code python>
#!/usr/bin/python
# ###############################################################################################
# HowToForge to Wiki converter
# ###############################################################################################
# Notes   : Originally developed in Embarcadero Delphi 2007 ... Ported to Python for
#           cross-platform compatibility
# Date    : 20121128
# Author  : Ascanio Pressato
# CoAuthor: Massimo Fioravanti
# Lic.    : Creative Commons
# ###############################################################################################

import urllib2
import sys
import re
import os
import argparse
import textwrap
import base64
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint

# Const
KBeginH1 = '===== '
KEndH1 = ' ====='
KBeginH2 = '==== '
KEndH2 = ' ===='
KBeginH3 = '=== '
KEndH3 = ' ==='
KBeginH4 = '== '
KEndH4 = ' =='
KBeginH5 = '= '
KEndH5 = ' ='
KBold = '**'
KItalic = '//'
KUnderline = '__'
KBeginStrike = '<del>'
KEndStrike = '</del>'
KCodeStr = "''"
KBeginLink = '[['
KEndLink = ']]'
KNumList = '  - '
KSymbolList = '  * '
KLine = '----'
KBeginKey = '<key>'
KEndKey = '</key>'
KBeginBox = '<box>'
KEndBox = '</box>'
KBeginFile = '<file>'
KEndFile = '</file>'
KBeginCode = '<code>'
KEndCode = '</code>'
KBeginNote = '<note>'
KEndNote = '</note>'
KBeginImage = '{{'
KEndImage = '}}'
KParagraph = r'\\ '
KSearchBeginTitle = r"(<title>)"
KSearchEndTitle = r"(<\/title>)"
KInvalidUrlChar = ' :.,\/+*'
KValidUrlChar = ''.ljust(len(KInvalidUrlChar), '_')
KBeginData = "<!-- begin content -->"
KEndData = "<div class=\"copyright-footer\">"
KEndData2 = "<!-- AddThis Button END -->"
KEndData3 = "<div class=\"book\">"

OutPutWiki = []
SlClassCSS = []
CodeMode = False
IsScript = False
IsNote = False
IsCmd = False
OkTrim = False
KBaseUrl = 'http://www.howtoforge.com/'
CreditsString = ""
CreditsInfo = []
canWork = False
PassCount = 0
PagesList = []
KCreditsHeader = 'Revisioni'
BeginPage = -1
EndPage = -1
KVersion = "2.9 Python"
KAuthor = "QXNjYW5pbyBQcmVzc2F0bw"
KCoAuthor = "TWFzc2ltbyBGaW9yYXZhbnRp"

# Transcode Table
# <p class="command"></p>         ==> <code></code>
# <p class="system"></p>          ==> Italic
# <p></p>                         ==> Paragraph
# <span class="system"></span>    ==> Italic
# <span class="highlight"></span> ==> Bold + Italic
# <pre></pre>                     ==> <box></box>
# <br>                            ==> Paragraph
# <a href="" >text</a>            ==> [[href|text]]
# {{:WikiNameSpace:ImageName|ImageComment}}
#
# Begin & End Of HTML to Wikize
#
# <div class="book">
#
# Sample for Credits Extraction
# <div class="info">Submitted by <a href="forums/member.php?u=2" title="View user profile." rel="nofollow">falko</a> (<a href="forums/private.php?do=newpm&u=2" title="Contact author." rel="nofollow">Contact Author</a>) (<a href="forums" title="Forums.">Forums</a>) on Sun, 2012-10-21 17:33.<span class="taxonomy"> :: <a href="sitemap/control-panels/ispconfig">ISPConfig</a> | <a href="sitemap/linux/ubuntu">Ubuntu</a> | <a href="sitemap/web-server">Web Server</a> | <a href="sitemap/web-server/apache">Apache</a> | <a href="sitemap/control-panels">Control Panels</a></span></div>
#
# ###############################################################################################
# VCL
# ###############################################################################################

def valueOf(List, Name):
    result = ""
    for eName, eValue in List:
        if eName == Name:
            result = eValue
    return result

def SameText(str1, str2):
    result = (str1.lower() == str2.lower())
    return result

def IncludeTrailingPathDelimiter(aPath):
    result = os.path.join(aPath, '')
    return result

def NameSpaceToPath(aNameSpace):
    result = ""
    NS = aNameSpace.split(':')
    for elem in NS:
        result = os.path.join(result, elem.replace(".", "_"))
    return result

def ForceDirectories(aPath):
    if not os.path.exists(aPath):
        os.makedirs(aPath)

# ###############################################################################################
# GENUTIL
# ###############################################################################################

def SostituisciAllChar(St, Replaced, ReplacedTo):
    result = St
    for i in range(len(Replaced)):
        result = result.replace(Replaced[i], ReplacedTo[i])
    return result

# ###############################################################################################
# PROGRAM
# ###############################################################################################

def ResetAllVar():
    # These must be declared global, otherwise the reset has no effect
    global SlClassCSS, CodeMode, IsScript, IsNote, IsCmd, OkTrim
    SlClassCSS = []
    CodeMode = False
    IsScript = False
    IsNote = False
    IsCmd = False
    OkTrim = False

def Parse_Params():
    parser = argparse.ArgumentParser(usage="%(prog)s [options]",
                                     version="%(prog)s " + KVersion,
                                     formatter_class=argparse.RawTextHelpFormatter,
                                     description="Authors: " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor) +
                                                 "\nConvert HowToForge.com articles into DokuWiki articles",
                                     epilog="Example: \n %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -p7 \n %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -f1 -t7")
    parser.add_argument("-p", "--pagecount", dest="pagecount", default=1,
                        help="Pages to download for this article [default: %(default)s]")
    parser.add_argument("-f", "--frompage", dest="frompage",
                        help="Begin page to download for this article (for partial download)")
    parser.add_argument("-t", "--topage", dest="topage",
                        help="End page to download for this article (for partial download)")
    parser.add_argument("-u", "--url", dest="url", metavar="URL", help="Url to convert")
    parser.add_argument("-w", "--wikiNS", dest="wikiNS", help="Wiki NameSpace")
    parser.add_argument("-m", "--mediaNS", dest="mediaNS", help="Media NameSpace")
    parser.add_argument("-c", "--creditstable", dest="creditstable", action="store_true", default=True,
                        help="Include Credits & Info Table [default: %(default)s]")
    parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="FOLDER",
                        help="Folder to save results")
    parser.add_argument("-n", "--flatnames", dest="useflatnames",
                        help="Use flat file names for pages and images")
    options = parser.parse_args()
    if options.url == None:
        print "\nWarning: How the hell can I convert something if you don't specify a URL !!!!"
    if (options.outfolder == None) or (options.url == None):
        parser.print_help()
        exit()
    if options.outfolder == None:
        print "\nWarning: Output folder not specified"
        exit()
    return options

def GetTitle(Html):
    match = re.search(KSearchBeginTitle, Html)
    beginTitle = match.start(1)
    match = re.search(KSearchEndTitle, Html)
    endTitle = match.start(1)
    result = Html[beginTitle + 7:endTitle]  # 7 == len('<title>')
    result = result[:result.find('|') - 1]
    return result

def CleanHTML(Html, PassNo):
    result = ""
    if PassNo == 1:
        Title = GetTitle(Html)
    else:
        Title = ""
    match = re.search(r"[^a-zA-Z](" + KBeginData + ")[^a-zA-Z]", Html)
    temp = Html[match.start(1):]
    if PassNo == 1:
        temp = temp[temp.replace('<p>', 'XXX', 1).find('<p>'):]
    else:
        temp = temp[temp.replace('</div>', 'XXX', 2).find('</div>') + 5:]
    if PassNo == 1:
        result = "<html><body><h1>" + Title + "</h1><div class=\"autoinserted\">" + temp
    else:
        result = "<html><body><div class=\"autoinserted\">" + temp
    match = re.search(r"(" + KEndData3 + ")", result)
    result = result[:match.start(1)] + "</body></html>"
    return result

def ExtractFileNameFromUrl(aUrl):
    result = aUrl.split('/')[-1]
    return result

def GetFlatUrlName(aUrl):
    result = ''
    UrlTokens = aUrl.split('/')
    del UrlTokens[0]
    del UrlTokens[0]
    FileName = UrlTokens.pop()
    for Token in UrlTokens:
        result = result + Token.replace('.', '_') + '_'
    result = result + FileName
    return result

def GetPageOutputDir(aWikiNameSpace, aOutPutFolder):
    result = ""
    if aWikiNameSpace != None:
        result = NameSpaceToPath(aWikiNameSpace)
    result = SostituisciAllChar(result, KInvalidUrlChar, KValidUrlChar)
    result = IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(aOutPutFolder) + result)
    return result

def GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder):
    NormalizedNameSpace = ""
    if aWikiMediaNameSpace != None:
        NormalizedNameSpace = NameSpaceToPath(aWikiMediaNameSpace)
    result = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
    result = IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(result) + NormalizedNameSpace)
    return result

def GetImageName(aImgUrl, aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder, aUseFlatUrls):
    if aUseFlatUrls:
        result = GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder) + GetFlatUrlName(aImgUrl)
    else:
        result = GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder) + ExtractFileNameFromUrl(aImgUrl)
    return result

def combOutput(aFile):
    fd = open(aFile)
    contents = fd.readlines()
    fd.close()
    new_contents = []
    for line in contents:
        # Strip whitespace; nothing is left if the line was just "\n"
        if not line.strip():
            continue
        # We got something, save it
        else:
            new_contents.append(line)
    print "Comb File: " + aFile
    fo = open(aFile, 'w')
    fo.write("".join(new_contents))
    fo.close()

def SaveToFile(WikiText, aWikiNameSpace, aOutPutFolder, aUrl):
    MyDir = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
    ForceDirectories(MyDir)
    OutFileName = MyDir + GetFlatUrlName(aUrl).replace('.', '_') + '.txt'
    print "Saving File: " + OutFileName
    fo = open(OutFileName, 'w')
    for x in WikiText:
        fo.write(x)
    fo.close()
    combOutput(OutFileName)

def DownloadImage(Url, SaveName):
    downloaddir = os.path.dirname(SaveName)
    ForceDirectories(downloaddir)
    result = True
    webFile = urllib2.urlopen(Url)
    data = webFile.read()
    try:
        # Bug fix: the original referenced an undefined 'response' here
        fileName = webFile.info()['content-disposition'].split('filename="')[1].split('"')[0]
    except:
        fileName = ExtractFileNameFromUrl(Url)
    localFile = os.path.join(downloaddir, fileName.replace('%20', '_'))
    print "Download Image: ", localFile
    with open(localFile, "wb") as image:
        image.write(data)
    webFile.close()
    return result

def CustomAdjustLine(aLine):
    result = aLine.replace("<vhost>", "[vhost]").replace(" //", "//").replace('<--', '<-').replace(' \\', '\\')
    return result

def CustomAdjustments(aData):
    print "Custom Adjustments begin"
    result = CustomAdjustLine(''.join(aData))
    print "Custom Adjustments end"
    return result

class MyCreditsParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        global CreditsString
        global canWork
        if SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'info'):
            canWork = True
        if SameText(tag, 'a') and SameText(valueOf(attrs, 'title'), 'View user profile.'):
            CreditsString = CreditsString + '^ Autore | [[' + KBaseUrl + valueOf(attrs, 'href') + '|'

    def handle_endtag(self, tag):
        global CreditsString
        global canWork
        global PassCount
        if SameText(tag, 'div'):
            canWork = False
        if SameText(tag, 'a'):
            if canWork and (PassCount < 1):
                CreditsString = CreditsString + ']] | \n'
                PassCount = PassCount + 1

    def handle_data(self, data):
        global CreditsString
        global canWork
        if canWork:
            if (data.strip() != 'Submitted by') and (data.strip() != '(') and (data.strip() != ')') \
               and (data.strip() != 'Forums') and (data.strip() != ') (') and (data.strip() != 'Contact Author'):
                match = re.search("(on (Mon|Tue|Wed|Thu|Fri|Sat|Sun), \d{4}-\d{2}-\d{2})", data)
                if match and (len(match.groups()) > 0):
                    CreditsString = CreditsString + '^ Ultima Modifica | ' + data[match.start(0) + 3:match.end(0)].replace(',', '') + ' | \n'
                    canWork = False
                else:
                    CreditsString = CreditsString + data.strip()

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        global SlClassCSS
        global CodeMode
        global IsScript
        global OutPutWiki
        global IsNote
        global OkTrim
        if SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'command'):
            OutPutWiki.append('\n' + KBeginCode + '\n')
            CodeMode = True
            OkTrim = True
        elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'system'):
            OutPutWiki.append(KParagraph + KItalic)
            OkTrim = True
        elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'highlight'):
            OutPutWiki.append(KParagraph + KBold + KItalic + KCodeStr)
            OkTrim = True
        elif SameText(tag, 'p') and (valueOf(attrs, 'class') == ""):
            OutPutWiki.append('\n' + KParagraph)
        elif SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'sponsor'):
            OutPutWiki.append('\n' + KBeginNote + '\n')
            IsNote = True
        elif SameText(tag, 'h1'):
            OutPutWiki.append('\n' + KBeginH1)
        elif SameText(tag, 'h2'):
            OutPutWiki.append('\n' + KBeginH2)
        elif SameText(tag, 'h3'):
            OutPutWiki.append('\n' + KBeginH3)
        elif SameText(tag, 'h4'):
            if IsNote:
                OutPutWiki.append('\n' + KBold)
            else:
                OutPutWiki.append('\n' + KBeginH4)
        elif SameText(tag, 'h5'):
            OutPutWiki.append('\n' + KBeginH5)
        elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'system'):
            OutPutWiki.append(KItalic + KCodeStr)
        elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'highlight'):
            OutPutWiki.append(KBold + KItalic + KCodeStr)
        elif SameText(tag, 'li'):
            OutPutWiki.append('\n' + KSymbolList)
        elif SameText(tag, 'br'):
            if CodeMode:
                OutPutWiki.append('\n')
            else:
                OutPutWiki.append(KParagraph + '\n')
        elif SameText(tag, 'b') or SameText(tag, 'strong'):
            OutPutWiki.append(KBold)
        elif SameText(tag, 'i') or SameText(tag, 'em'):
            OutPutWiki.append(KItalic)
        elif SameText(tag, 'pre'):
            OutPutWiki.append('\n' + KBeginFile + '\n')
            CodeMode = True
        elif SameText(tag, 'script'):
            IsScript = True
        elif SameText(tag, 'noscript'):
            IsScript = True
        elif SameText(tag, 'a') and not SameText(valueOf(attrs, 'class'), 'thickbox'):
            OutPutWiki.append(KBeginLink + valueOf(attrs, 'href') + '|')
        elif SameText(tag, 'img'):
            if not SameText(valueOf(attrs, 'alt'), 'Click to enlarge'):
                if DownloadImage(valueOf(attrs, 'src'), GetImageName(valueOf(attrs, 'src'), MyOptions.mediaNS, MyOptions.wikiNS, MyOptions.outfolder, MyOptions.useflatnames)):
                    MyNS = MyOptions.mediaNS
                    if MyNS == None:
                        MyNS = ""
                    else:
                        MyNS = ":" + MyNS + ":"
                    if MyOptions.useflatnames:
                        ImgWikiTag = '\n' + KBeginImage + MyNS + GetFlatUrlName(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                        OutPutWiki.append(ImgWikiTag)
                    else:
                        ImgWikiTag = '\n' + KBeginImage + MyNS + ExtractFileNameFromUrl(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                        OutPutWiki.append(ImgWikiTag)
                else:
                    OutPutWiki.append('\n' + ' #IMG:' + valueOf(attrs, 'src') + ' (Failed to Download)' + '\n')
        if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
            SlClassCSS.insert(0, (tag, valueOf(attrs, 'class')))

    def handle_endtag(self, tag):
        global SlClassCSS
        global CodeMode
        global IsScript
        global OutPutWiki
        global IsNote
        global OkTrim
        if SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'command'):
            OutPutWiki.append('\n' + KEndCode + '\n')
            CodeMode = False
            OkTrim = False
        elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'highlight'):
            OutPutWiki.append(KCodeStr + KItalic + KBold + KParagraph)
            OkTrim = False
        elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'system'):
            OutPutWiki.append(KItalic + KParagraph)
            OkTrim = False
        elif SameText(tag, 'p'):
            OutPutWiki.append(KParagraph + '\n')
        elif SameText(tag, 'div') and SameText(SlClassCSS[0][1], 'sponsor'):
            OutPutWiki.append('\n' + KEndNote + '\n')
            IsNote = False
        elif SameText(tag, 'h1'):
            OutPutWiki.append(KEndH1 + '\n')
        elif SameText(tag, 'h2'):
            OutPutWiki.append(KEndH2 + '\n')
        elif SameText(tag, 'h3'):
            OutPutWiki.append(KEndH3 + '\n')
        elif SameText(tag, 'h4'):
            if IsNote:
                OutPutWiki.append(KBold + '\n' + '----' + '\n')
            else:
                OutPutWiki.append(KEndH4 + '\n')
        elif SameText(tag, 'h5'):
            OutPutWiki.append(KEndH5 + '\n')
        elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'system'):
            OutPutWiki.append(KCodeStr + KItalic)
        elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'highlight'):
            OutPutWiki.append(KCodeStr + KItalic + KBold)
        elif SameText(tag, 'li'):
            OutPutWiki.append('\n')
        elif SameText(tag, 'b') or SameText(tag, 'strong'):
            OutPutWiki.append(KBold + ' ')
        elif SameText(tag, 'i') or SameText(tag, 'em'):
            OutPutWiki.append(KItalic)
        elif SameText(tag, 'pre'):
            OutPutWiki.append('\n' + KEndFile + '\n')
            CodeMode = False
        elif SameText(tag, 'script'):
            IsScript = False
        elif SameText(tag, 'noscript'):
            IsScript = False
        elif SameText(tag, 'a') and not SameText(SlClassCSS[0][1], 'thickbox'):
            OutPutWiki.append(KEndLink + ' ')
        if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
            if SameText(tag, SlClassCSS[0][0]):
                del SlClassCSS[0]

    def handle_entityref(self, name):
        if name == 'gt':
            OutPutWiki.append('>')
        elif name == 'lt':
            OutPutWiki.append('<')
        else:
            OutPutWiki.append(unichr(name2codepoint[name]).encode('utf8'))

    def handle_data(self, data):
        global CodeMode
        global IsScript
        global OkTrim
        if not IsScript:
            if CodeMode or OkTrim:
                OutPutWiki.append(data.strip())
            else:
                OutPutWiki.append(data)

def getbaseurl(aUrl):
    result = aUrl
    match = re.search(r"(-p[0-9])", aUrl)
    try:
        EndUrl = match.start(1)
    except:
        EndUrl = len(aUrl)
    result = aUrl[:EndUrl]
    return result

def getutlforpage(aBaseUrl, aPageNo):
    result = getbaseurl(aBaseUrl)
    if aPageNo != 1:
        result = result + "-p" + str(aPageNo)
    return result

def getCredits(aCreditsTable):
    result = KBeginH2 + KCreditsHeader + KEndH2 + "\n\n"
    result = result + "^ " + KCreditsHeader + " ^^ \n" + aCreditsTable + "^ Pagine ^^ \n"
    for Page in PagesList:
        result = result + "| [[" + Page + "]] || \n"
    return result

def ConvertPage(aUrl, aPassNo):
    global CreditsInfo
    print "Converting page: ", aUrl
    # Track pages
    PagesList.append(aUrl)
    # Reset vars
    ResetAllVar()
    # Download HTML
    usock = urllib2.urlopen(aUrl)
    data = usock.read()
    usock.close()
    # Before the credits data is stripped from the HTML ...
    if CreditsInfo == []:
        CreditsParser = MyCreditsParser()
        CreditsParser.feed(data)
        # Generate the Revisions / Credits table
        CreditsInfo = []
        CreditsInfo.append(getCredits(CreditsString))
    # Clean HTML
    MyHtml = CleanHTML(data, aPassNo)
    #SaveToFile(MyHtml, MyOptions.wikiNS, MyOptions.outfolder, aUrl + "_" + str(aPassNo))
    # Parse HTML
    parser = MyHTMLParser()
    parser.feed(MyHtml)

def ConvertSinglePage(aUrl):
    global OutPutWiki
    ConvertPage(aUrl, 1)
    # "Special" conversions
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the CreditsTable at the end of the article
    # (OutPutWiki is a plain string after CustomAdjustments, so concatenate
    #  instead of calling append on it as the original did)
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to file
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, aUrl)

def ConvertMultiPage(aUrl, aBeginPage, aEndPage):
    global OutPutWiki
    for idx in range(aBeginPage, aEndPage + 1):
        ConvertPage(getutlforpage(MyOptions.url, idx), idx)
    # "Special" conversions
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the CreditsTable at the end of the article
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to file
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, getutlforpage(MyOptions.url, 1))

if __name__ == "__main__":
    MyOptions = Parse_Params()
    if MyOptions.wikiNS != None:
        MyOptions.wikiNS = MyOptions.wikiNS.replace('.', '_')
    if MyOptions.mediaNS != None:
        MyOptions.mediaNS = MyOptions.mediaNS.replace('.', '_')
    print os.path.basename(sys.argv[0]) + " By " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor)
    print "Begin Convert"
    if MyOptions.frompage != None:
        BeginPage = int(MyOptions.frompage)
    if MyOptions.topage != None:
        EndPage = int(MyOptions.topage)
    if (BeginPage != -1) and (EndPage == -1):
        EndPage = BeginPage
    if (BeginPage == -1) and (EndPage != -1):
        BeginPage = EndPage
    if (BeginPage == -1) and (EndPage == -1):
        if int(MyOptions.pagecount) == 0:
            ConvertSinglePage(MyOptions.url)
            exit()  # avoid falling through and converting the page twice
        else:
            BeginPage = 1
            EndPage = int(MyOptions.pagecount)
    if BeginPage == EndPage:
        ConvertSinglePage(MyOptions.url)
    else:
        ConvertMultiPage(MyOptions.url, BeginPage, EndPage)
    print "Convert done"
</code>

To test the software I put together a small batch file:

<code dos>
rem HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -p7 -o .\Test -m linux:perfect_server_ubuntu_12.10
HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -f1 -t7 -o .\Test -m linux:perfect_server_ubuntu_12.10
</code>

The two lines are equivalent: they start from the first page of the article found at that address and download all 7 pages of the article into a single ''.txt'' file, downloading the images as well.

The {{:python:htf2wiki.7z|source}} is attached too.

===== Download with Progress Bar =====

An example of a download with a progress bar:

<code python>
import urllib2

url = "http://download.thinkbroadband.com/10MB.zip"

file_name = url.split('/')[-1]
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)

file_size_dl = 0
block_sz = 8192
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
    file_size_dl += len(buffer)
    f.write(buffer)
    status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
    status = status + chr(8) * (len(status) + 1)
    print status,

f.close()
</code>
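The core technique behind the converter at the top of the page, feeding HTML through ''HTMLParser'' callbacks and emitting DokuWiki markup, can be shown in a minimal, self-contained sketch. The class name and the tiny tag table below are illustrative only, not taken from the script; the import dance makes it run on both Python 2 and 3:

```python
# Minimal sketch (hypothetical names) of the HTMLParser -> DokuWiki technique
try:
    from HTMLParser import HTMLParser   # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

class MiniWikiParser(HTMLParser):
    # Tiny subset of the converter's transcode table: HTML tag -> DokuWiki markup
    TAGS = {'b': '**', 'strong': '**', 'i': '//', 'em': '//', 'u': '__'}

    def __init__(self):
        HTMLParser.__init__(self)
        self.out = []

    def handle_starttag(self, tag, attrs):
        # Emit the opening wiki markup for known tags, nothing for the rest
        self.out.append(self.TAGS.get(tag, ''))

    def handle_endtag(self, tag):
        # DokuWiki markers are symmetric, so the close is the same string
        self.out.append(self.TAGS.get(tag, ''))

    def handle_data(self, data):
        self.out.append(data)

    def result(self):
        return ''.join(self.out)

p = MiniWikiParser()
p.feed('Hello <b>bold</b> and <em>italic</em> text')
print(p.result())  # Hello **bold** and //italic// text
```

The full script above does exactly this, only with many more tag cases and a stack (''SlClassCSS'') to remember each open tag's CSS class until its closing tag arrives.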
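The status line in the download loop packs two tricks into little space: a fixed-width format string so the line length never changes, and a run of backspace characters (''chr(8)'') so the next trailing-comma print overwrites the same console line. The formatting half can be isolated in a small helper (the function name is mine, not from the example):

```python
def status_line(downloaded, total):
    # Same fixed-width format as the progress-bar example:
    # %10d right-aligns the byte count, %3.2f prints the percentage.
    return "%10d [%3.2f%%]" % (downloaded, downloaded * 100.0 / total)

# chr(8) is backspace: appending len(status) + 1 of them rewinds the
# cursor so the next write starts at the beginning of the same line.
print(status_line(4096, 8192))  # ->       4096 [50.00%]
```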