文字エンコーディングを自動判別してファイル読み込み v1.2.1

AppleScript名:文字エンコーディングを自動判別してファイル読み込み v1.2.1
— Created 2014-12-28 by Takaaki Naganoya
— Modified 2014-12-29 by Shane Stanley
— Modified 2015-10-03 by Takaaki Naganoya
use AppleScript version "2.5"
use scripting additions
use framework "Foundation"

set aPath to POSIX path of (choose file)
set aRes to readJapanesTextFileWithGuessingEncoding(aPath) of me
set bRes to aRes as string

–Read Japanese text with detecting its text encoding
on readJapanesTextFileWithGuessingEncoding(aPOSIXpath as string)
  
  
–ISO2022JP check
  
set aNSData to current application’s NSData’s dataWithContentsOfFile:aPOSIXpath
  
set aDataLength to aNSData’s |length|()
  
if aDataLength > 1024 then set aDataLength to 1024
  
  
–0x1B check
  
set anNSString to current application’s NSString’s stringWithString:(character id 27) — 0x1B
  
set theData to anNSString’s dataUsingEncoding:(current application’s NSUTF8StringEncoding)
  
set theRange to aNSData’s rangeOfData:theData options:0 range:(current application’s NSMakeRange(0, aDataLength))
  
  
–found 0x1B in aNSData
  
if |length| of theRange = 1 and location of theRange < aDataLength then
    set aStr to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSISO2022JPStringEncoding)) –21
    
if aStr is not equal to missing value then return (aStr as text) — ISO2022JP
  end if
  
  
–EUC
  
set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSJapaneseEUCStringEncoding))
  
–log resValue
  
if resValue is not equal to missing value then return (resValue as text)
  
–UTF-8
  
set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSUTF8StringEncoding))
  
–log resValue
  
if resValue is not equal to missing value then return (resValue as text)
  
–SHift JIS
  
set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSShiftJISStringEncoding))
  
–log resValue
  
if resValue is not equal to missing value then return (resValue as text)
  
  
  
–UTF-16BE/LE/無印Unicodeは多数決を取る
  
set resValue1 to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSUTF16BigEndianStringEncoding)) as text
  
–log resValue1
  
set sample1 to getTextSample(resValue1) of me
  
set lang1 to specifyLanguageOfText(sample1) of me
  
set para1 to length of (paragraphs of sample1)
  
set words1 to length of (words of sample1)
  
  
set resValue2 to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSUTF16LittleEndianStringEncoding)) as text
  
–log resValue2
  
set sample2 to getTextSample(resValue2) of me
  
set lang2 to specifyLanguageOfText(sample2) of me
  
set para2 to length of (paragraphs of sample2)
  
set words2 to length of (words of sample2)
  
  
set resValue3 to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSUnicodeStringEncoding)) as text
  
–log resValue3
  
set sample3 to getTextSample(resValue3) of me
  
set lang3 to specifyLanguageOfText(sample3) of me
  
set para3 to length of (paragraphs of sample3)
  
set words3 to length of (words of sample3)
  
  
–文字および文法的に見て「日本語」ならそれを返す
  
if lang1 = "ja" then return resValue1
  
if lang2 = "ja" then return resValue2
  
if lang3 = "ja" then return resValue2
  
  
  
–文字化けしたときには、日本語の「Word」として認識されづらく、Paragraphも少ない(1とか)なので条件で除外する
  
if para1 is not equal to 1 then
    if (words1 words2) or (words1 words3) then
      return resValue1
    end if
  end if
  
  
if para2 is not equal to 1 then
    if (words2 words1) or (words2 words3) then
      return resValue2
    end if
  end if
  
  
if para3 is not equal to 1 then
    if (words3 words1) or (words3 words2) then
      return resValue3
    end if
  end if
  
  
return false
  
  
(*
  –おまけ(未確認)
  set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSWindowsCP1251StringEncoding))
  if resValue is not equal to missing value then return resValue
  
  set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSWindowsCP1252StringEncoding))
  if resValue is not equal to missing value then return resValue
  
  set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSWindowsCP1253StringEncoding))
  if resValue is not equal to missing value then return resValue
  
  set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSWindowsCP1254StringEncoding))
  if resValue is not equal to missing value then return resValue
  
  set resValue to (current application’s NSString’s alloc()’s initWithData:aNSData encoding:(current application’s NSWindowsCP1250StringEncoding))
  if resValue is not equal to missing value then return resValue
  
  return false
  *)

end readJapanesTextFileWithGuessingEncoding

on specifyLanguageOfText(aStr)
  set aNSstring to current application’s NSString’s stringWithString:aStr
  
set tagSchemes to current application’s NSArray’s arrayWithObjects:(current application’s NSLinguisticTagSchemeLanguage)
  
set tagger to current application’s NSLinguisticTagger’s alloc()’s initWithTagSchemes:tagSchemes options:0
  
tagger’s setString:aNSstring
  
set aLanguage to tagger’s tagAtIndex:0 |scheme|:(current application’s NSLinguisticTagSchemeLanguage) tokenRange:(missing value) sentenceRange:(missing value)
  
return aLanguage as text
end specifyLanguageOfText

on getTextSample(aText)
  set aLen to length of aText
  
if aLen < 1024 then
    set bLen to aLen
  else
    set bLen to 1024
  end if
  
return (text 1 thru bLen of aText)
end getTextSample

★Click Here to Open This Script 

Please follow and like us:

文字エンコーディングを自動判別してファイル読み込み v1.2.1」への1件のコメント

コメントを残す

メールアドレスが公開されることはありません。 * が付いている欄は必須項目です